Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 132 results for author: <span class="mathjax">Weston, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Weston%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Weston, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Weston%2C+J&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Weston, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Weston%2C+J&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Weston%2C+J&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Weston%2C+J&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Weston%2C+J&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08524">arXiv:2502.08524</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.08524">pdf</a>, <a href="https://arxiv.org/format/2502.08524">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LLM Pretraining with Continuous Concepts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tack%2C+J">Jihoon Tack</a>, <a href="/search/cs?searchtype=author&amp;query=Lanchantin%2C+J">Jack Lanchantin</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+J">Jane Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Cohen%2C+A">Andrew Cohen</a>, <a href="/search/cs?searchtype=author&amp;query=Kulikov%2C+I">Ilia Kulikov</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+J">Janice Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Hao%2C+S">Shibo Hao</a>, <a href="/search/cs?searchtype=author&amp;query=Tian%2C+Y">Yuandong Tian</a>, <a href="/search/cs?searchtype=author&amp;query=Weston%2C+J">Jason Weston</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xian Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08524v1-abstract-short" style="display: inline;"> Next token prediction has been the standard training objective used in large language model pretraining. Representations are learned as a result of optimizing for token-level perplexity. We propose Continuous Concept Mixing (CoCoMix), a novel pretraining framework that combines discrete next token prediction with continuous concepts. 
Specifically, CoCoMix predicts continuous concepts learned from&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08524v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08524v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08524v1-abstract-full" style="display: none;"> Next token prediction has been the standard training objective used in large language model pretraining. Representations are learned as a result of optimizing for token-level perplexity. We propose Continuous Concept Mixing (CoCoMix), a novel pretraining framework that combines discrete next token prediction with continuous concepts. Specifically, CoCoMix predicts continuous concepts learned from a pretrained sparse autoencoder and mixes them into the model&#39;s hidden state by interleaving with token hidden representations. Through experiments on multiple benchmarks, including language modeling and downstream reasoning tasks, we show that CoCoMix is more sample efficient and consistently outperforms standard next token prediction, knowledge distillation and inserting pause tokens. We find that combining both concept learning and interleaving in an end-to-end framework is critical to performance gains. Furthermore, CoCoMix enhances interpretability and steerability by allowing direct inspection and modification of the predicted concept, offering a transparent way to guide the model&#39;s internal reasoning process. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08524v1-abstract-full').style.display = 'none'; document.getElementById('2502.08524v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
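
The interleaving mechanism this abstract describes can be sketched concretely. Below is a minimal, illustrative PyTorch toy: a head predicts a continuous concept vector (supervised in the paper by a pretrained sparse autoencoder, not shown here), which is projected back to model width and interleaved with the token hidden states. Module names and dimensions are assumptions for illustration, not the paper's.

```python
import torch
import torch.nn as nn

class CoCoMixBlock(nn.Module):
    def __init__(self, d_model=64, d_concept=32):
        super().__init__()
        self.concept_head = nn.Linear(d_model, d_concept)  # predicts continuous concepts
        self.concept_proj = nn.Linear(d_concept, d_model)  # maps concepts back to model width

    def forward(self, hidden):               # hidden: (batch, seq, d_model)
        concept = self.concept_head(hidden)  # trained against SAE concepts (loss not shown)
        mixed = self.concept_proj(concept)   # the continuous concept as an extra "token"
        # Interleave: each token state is followed by its concept state.
        out = torch.stack([hidden, mixed], dim=2).flatten(1, 2)
        return out, concept

block = CoCoMixBlock()
h = torch.randn(2, 5, 64)
out, concept = block(h)
print(out.shape)  # torch.Size([2, 10, 64]) -- concepts interleaved with tokens
```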
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18578">arXiv:2501.18578</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.18578">pdf</a>, <a href="https://arxiv.org/format/2501.18578">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> R.I.P.: Better Models by Survival of the Fittest Prompts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+P">Ping Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+W">Weizhe Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Golovneva%2C+O">Olga Golovneva</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+T">Tianhao Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Sukhbaatar%2C+S">Sainbayar Sukhbaatar</a>, <a href="/search/cs?searchtype=author&amp;query=Weston%2C+J">Jason Weston</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jing Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18578v1-abstract-short" style="display: inline;"> Training data quality is one of the most important drivers of final model quality. In this work, we introduce a method for evaluating data integrity based on the assumption that low-quality input prompts result in high variance and low quality responses. This is achieved by measuring the rejected response quality and the reward gap between the chosen and rejected preference pair. Our method, Rejec&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18578v1-abstract-full').style.display = 'inline'; document.getElementById('2501.18578v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18578v1-abstract-full" style="display: none;"> Training data quality is one of the most important drivers of final model quality. In this work, we introduce a method for evaluating data integrity based on the assumption that low-quality input prompts result in high variance and low quality responses. This is achieved by measuring the rejected response quality and the reward gap between the chosen and rejected preference pair. Our method, Rejecting Instruction Preferences (RIP) can be used to filter prompts from existing training sets, or to make high quality synthetic datasets, yielding large performance gains across various benchmarks compared to unfiltered data. Using Llama 3.1-8B-Instruct, RIP improves AlpacaEval2 LC Win Rate by 9.4%, Arena-Hard by 8.7%, and WildBench by 9.9%. Using Llama 3.3-70B-Instruct, RIP improves Arena-Hard from 67.5 to 82.9, which is from 18th place to 6th overall in the leaderboard. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18578v1-abstract-full').style.display = 'none'; document.getElementById('2501.18578v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18101">arXiv:2501.18101</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.18101">pdf</a>, <a href="https://arxiv.org/format/2501.18101">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Diverse Preference Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lanchantin%2C+J">Jack Lanchantin</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+A">Angelica Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Dhuliawala%2C+S">Shehzaad Dhuliawala</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+P">Ping Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Weston%2C+J">Jason Weston</a>, <a href="/search/cs?searchtype=author&amp;query=Sukhbaatar%2C+S">Sainbayar Sukhbaatar</a>, <a href="/search/cs?searchtype=author&amp;query=Kulikov%2C+I">Ilia Kulikov</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18101v3-abstract-short" style="display: inline;"> Post-training of language models, either through reinforcement learning, preference optimization or supervised finetuning, tends to sharpen the output probability distribution and reduce the diversity of generated responses. This is particularly a problem for creative generative tasks where varied responses are desired. In this work we introduce Diverse Preference Optimization (DivPO), an optimiza&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18101v3-abstract-full').style.display = 'inline'; document.getElementById('2501.18101v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18101v3-abstract-full" style="display: none;"> Post-training of language models, either through reinforcement learning, preference optimization or supervised finetuning, tends to sharpen the output probability distribution and reduce the diversity of generated responses. This is particularly a problem for creative generative tasks where varied responses are desired. In this work we introduce Diverse Preference Optimization (DivPO), an optimization method which learns to generate much more diverse responses than standard pipelines, while maintaining the quality of the generations. In DivPO, preference pairs are selected by first considering a pool of responses, and a measure of diversity among them, and selecting chosen examples as being more rare but high quality, while rejected examples are more common, but low quality. 

3. arXiv:2501.18101 [pdf, other] (cs.CL)
   Diverse Preference Optimization
   Authors: Jack Lanchantin, Angelica Chen, Shehzaad Dhuliawala, Ping Yu, Jason Weston, Sainbayar Sukhbaatar, Ilia Kulikov
   Abstract: Post-training of language models, whether through reinforcement learning, preference optimization, or supervised finetuning, tends to sharpen the output probability distribution and reduce the diversity of generated responses. This is particularly a problem for creative generative tasks, where varied responses are desired. In this work we introduce Diverse Preference Optimization (DivPO), an optimization method which learns to generate much more diverse responses than standard pipelines while maintaining the quality of the generations. In DivPO, preference pairs are selected by first considering a pool of responses and a measure of diversity among them: chosen examples are rarer but high quality, while rejected examples are more common but low quality. DivPO generates 45.6% more diverse persona attributes and yields a 74.6% increase in story diversity, while maintaining win rates similar to standard baselines.
   Submitted 10 February, 2025; v1 submitted 29 January, 2025; originally announced January 2025.
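
The pair-selection rule lends itself to a short sketch: from a pool of scored responses, pick the rarest response above a quality bar as chosen and the most common response below it as rejected. The frequency-based rarity measure and the 0.5 quality bar are assumptions for illustration.

```python
from collections import Counter

def divpo_pair(responses, qualities, quality_bar=0.5):
    counts = Counter(responses)                   # commonness as a rarity proxy
    high = [(r, q) for r, q in zip(responses, qualities) if q >= quality_bar]
    low  = [(r, q) for r, q in zip(responses, qualities) if q <  quality_bar]
    if not high or not low:
        return None                               # no valid pair for this prompt
    chosen   = min(high, key=lambda rq: counts[rq[0]])[0]  # rare but high quality
    rejected = max(low,  key=lambda rq: counts[rq[0]])[0]  # common and low quality
    return chosen, rejected

pool = ["a knight", "a knight", "a knight", "a squid astronaut", "a knight"]
qual = [0.4, 0.6, 0.3, 0.8, 0.45]
print(divpo_pair(pool, qual))  # ('a squid astronaut', 'a knight')
```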

4. arXiv:2501.18099 [pdf, other] (cs.AI, cs.CL)
   Learning to Plan & Reason for Evaluation with Thinking-LLM-as-a-Judge
   Authors: Swarnadeep Saha, Xian Li, Marjan Ghazvininejad, Jason Weston, Tianlu Wang
   Abstract: LLM-as-a-Judge models generate chain-of-thought (CoT) sequences intended to capture the step-by-step reasoning process that underlies the final evaluation of a response. However, due to the lack of human-annotated CoTs for evaluation, the required components and structure of effective reasoning traces remain understudied. Consequently, previous approaches often (1) constrain reasoning traces to hand-designed components, such as a list of criteria, reference answers, or verification questions, and (2) structure them such that planning is intertwined with the reasoning for evaluation. In this work, we propose EvalPlanner, a preference optimization algorithm for Thinking-LLM-as-a-Judge that first generates an unconstrained evaluation plan, followed by its execution, and then the final judgment. In a self-training loop, EvalPlanner iteratively optimizes over synthetically constructed evaluation plans and executions, leading to better final verdicts. Our method achieves new state-of-the-art performance for generative reward models on RewardBench (with a score of 93.9), despite being trained on fewer, and synthetically generated, preference pairs. Additional experiments on other benchmarks like RM-Bench, JudgeBench, and FollowBenchEval further highlight the utility of both planning and reasoning for building robust LLM-as-a-Judge reasoning models.
   Submitted 29 January, 2025; originally announced January 2025.
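
A rough sketch of the plan-then-execute-then-judge decomposition the abstract describes, with a stub standing in for any LLM call. The prompt wording and the three-stage split are illustrative assumptions, not the paper's prompts.

```python
def generate(prompt: str) -> str:
    return f"<model output for: {prompt[:40]}...>"   # stub LLM call

def eval_planner_judge(instruction, response_a, response_b):
    plan = generate(
        f"Write an evaluation plan for judging responses to: {instruction}")
    execution = generate(
        f"Follow this plan step by step.\nPlan: {plan}\n"
        f"Response A: {response_a}\nResponse B: {response_b}")
    verdict = generate(
        f"Given this reasoning, output the better response (A or B).\n{execution}")
    return plan, execution, verdict  # verdict pairs feed preference optimization

print(eval_planner_judge("Summarize the article.", "sum A", "sum B")[2])
```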

5. arXiv:2501.10799 [pdf, other] (cs.LG, cs.AI)
   Step-KTO: Optimizing Mathematical Reasoning through Stepwise Binary Feedback
   Authors: Yen-Ting Lin, Di Jin, Tengyu Xu, Tianhao Wu, Sainbayar Sukhbaatar, Chen Zhu, Yun He, Yun-Nung Chen, Jason Weston, Yuandong Tian, Arash Rahnama, Sinong Wang, Hao Ma, Han Fang
   Abstract: Large language models (LLMs) have recently demonstrated remarkable success in mathematical reasoning. Despite progress in methods like chain-of-thought prompting and self-consistency sampling, these advances often focus on final correctness without ensuring that the underlying reasoning process is coherent and reliable. This paper introduces Step-KTO, a training framework that combines process-level and outcome-level binary feedback to guide LLMs toward more trustworthy reasoning trajectories. By providing binary evaluations for both the intermediate reasoning steps and the final answer, Step-KTO encourages the model to adhere to logical progressions rather than relying on superficial shortcuts. Our experiments on challenging mathematical benchmarks show that Step-KTO significantly improves both final answer accuracy and the quality of intermediate reasoning steps. For example, on the MATH-500 dataset, Step-KTO achieves a notable improvement in Pass@1 accuracy over strong baselines. These results highlight the promise of integrating stepwise process feedback into LLM training, paving the way toward more interpretable and dependable reasoning capabilities.
   Submitted 18 January, 2025; originally announced January 2025.
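
The feedback signal can be sketched as follows. The abstract does not specify how step-level and outcome-level feedback are weighted or how the result enters a KTO-style objective, so the 0.5 weighting below is purely an assumption.

```python
def step_kto_signal(step_feedback, outcome_ok, step_weight=0.5):
    # step_feedback: list of booleans, one binary evaluation per reasoning step
    step_score = sum(step_feedback) / len(step_feedback) if step_feedback else 0.0
    return step_weight * step_score + (1 - step_weight) * float(outcome_ok)

# A trajectory with one flawed step but a correct final answer:
print(step_kto_signal([True, False, True], outcome_ok=True))  # 0.833...
```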

6. arXiv:2412.09871 [pdf, other] (cs.CL)
   Byte Latent Transformer: Patches Scale Better Than Tokens
   Authors: Artidoro Pagnoni, Ram Pasunuru, Pedro Rodriguez, John Nguyen, Benjamin Muller, Margaret Li, Chunting Zhou, Lili Yu, Jason Weston, Luke Zettlemoyer, Gargi Ghosh, Mike Lewis, Ari Holtzman, Srinivasan Iyer
   Abstract: We introduce the Byte Latent Transformer (BLT), a new byte-level LLM architecture that, for the first time, matches tokenization-based LLM performance at scale with significant improvements in inference efficiency and robustness. BLT encodes bytes into dynamically sized patches, which serve as the primary units of computation. Patches are segmented based on the entropy of the next byte, allocating more compute and model capacity where increased data complexity demands it. We present the first FLOP-controlled scaling study of byte-level models up to 8B parameters and 4T training bytes. Our results demonstrate the feasibility of scaling models trained on raw bytes without a fixed vocabulary. Both training and inference efficiency improve due to dynamically selecting long patches when data is predictable, along with qualitative improvements on reasoning and long-tail generalization. Overall, for fixed inference costs, BLT shows significantly better scaling than tokenization-based models by simultaneously growing both patch and model size.
   Submitted 13 December, 2024; originally announced December 2024.
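
Entropy-based patching is easy to illustrate: start a new patch wherever the entropy of the next-byte distribution is high, so compute concentrates on hard-to-predict regions. The unigram-count entropy proxy and the threshold below are stand-ins for the paper's learned byte-level model.

```python
import math
from collections import Counter

def next_byte_entropy(context: bytes) -> float:
    counts = Counter(context)                 # crude proxy for a small byte LM
    total = sum(counts.values())
    return -sum(c / total * math.log2(c / total) for c in counts.values())

def entropy_patches(data: bytes, threshold: float, window: int = 8):
    patches, start = [], 0
    for i in range(1, len(data)):
        if next_byte_entropy(data[max(0, i - window):i]) > threshold:
            patches.append(data[start:i])     # high entropy: begin a new patch
            start = i
    patches.append(data[start:])
    return patches

# Short patches cluster around the unpredictable middle region:
print(entropy_patches(b"aaaaaaaaXq9Zbbbbbbbb", threshold=1.5))
```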

7. arXiv:2412.06769 [pdf, other] (cs.CL)
   Training Large Language Models to Reason in a Continuous Latent Space
   Authors: Shibo Hao, Sainbayar Sukhbaatar, DiJia Su, Xian Li, Zhiting Hu, Jason Weston, Yuandong Tian
   Abstract: Large language models (LLMs) are restricted to reasoning in the "language space", where they typically express the reasoning process with a chain-of-thought (CoT) to solve a complex reasoning problem. However, we argue that language space may not always be optimal for reasoning. For example, most word tokens are primarily for textual coherence and not essential for reasoning, while some critical tokens require complex planning and pose huge challenges to LLMs. To explore the potential of LLM reasoning in an unrestricted latent space instead of using natural language, we introduce a new paradigm, Coconut (Chain of Continuous Thought). We utilize the last hidden state of the LLM as a representation of the reasoning state (termed "continuous thought"). Rather than decoding this into a word token, we feed it back to the LLM as the subsequent input embedding directly in the continuous space. Experiments show that Coconut can effectively augment the LLM on several reasoning tasks. This novel latent reasoning paradigm leads to emergent advanced reasoning patterns: the continuous thought can encode multiple alternative next reasoning steps, allowing the model to perform a breadth-first search (BFS) to solve the problem, rather than prematurely committing to a single deterministic path like CoT. Coconut outperforms CoT in certain logical reasoning tasks that require substantial backtracking during planning, using fewer thinking tokens during inference. These findings demonstrate the promise of latent reasoning and offer valuable insights for future research.
   Submitted 10 December, 2024; v1 submitted 9 December, 2024; originally announced December 2024.
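
The feedback loop is concrete enough to sketch. In the toy below a GRU cell stands in for the LLM; only the wiring mirrors the abstract: latent "thought" steps skip decoding and feed the last hidden state back as the next input embedding. All sizes are illustrative.

```python
import torch
import torch.nn as nn

d = 32
core = nn.GRUCell(d, d)          # stand-in for a transformer LM
embed = nn.Embedding(100, d)
lm_head = nn.Linear(d, 100)

def generate_with_continuous_thoughts(tokens, n_thoughts=3):
    h = torch.zeros(1, d)
    embs = embed(tokens)                      # (len, d)
    for e in embs:                            # encode the prompt as usual
        h = core(e.unsqueeze(0), h)
    inp = embs[-1].unsqueeze(0)
    for _ in range(n_thoughts):               # latent "thought" steps: no decoding
        h = core(inp, h)
        inp = h                               # hidden state becomes the next input
    return lm_head(h).argmax(dim=-1).item()   # decode only the final answer token

print(generate_with_continuous_thoughts(torch.tensor([5, 17, 42])))
```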

8. arXiv:2412.04305 [pdf, other] (cs.CL, cs.LG)
   ALMA: Alignment with Minimal Annotation
   Authors: Michihiro Yasunaga, Leonid Shamis, Chunting Zhou, Andrew Cohen, Jason Weston, Luke Zettlemoyer, Marjan Ghazvininejad
   Abstract: Recent approaches to large language model (LLM) alignment typically require millions of human annotations or rely on external aligned models for synthetic data generation. This paper introduces ALMA: Alignment with Minimal Annotation, demonstrating that effective alignment can be achieved using only 9,000 labeled examples -- less than 1% of conventional approaches. ALMA generates large amounts of high-quality synthetic alignment data through new techniques: diverse prompt synthesis via few-shot learning, diverse response generation with multiple model checkpoints, and judge (reward model) enhancement through score aggregation and self-distillation. Using only a pretrained Llama3 base model, 5,000 SFT examples, and 4,000 judge annotations, ALMA achieves performance close to Llama3-Instruct across diverse alignment benchmarks (e.g., a 0.1% difference on AlpacaEval 2.0 score). These results are achieved with a multi-round, self-bootstrapped data synthesis and training recipe that continues to improve for 10 rounds, surpassing the typical 3-round ceiling of previous methods. These results suggest that base models already possess sufficient knowledge for effective alignment, and that synthetic data generation methods can expose it.
   Submitted 5 December, 2024; originally announced December 2024.

9. arXiv:2411.09661 [pdf, other] (cs.CL)
   Adaptive Decoding via Latent Preference Optimization
   Authors: Shehzaad Dhuliawala, Ilia Kulikov, Ping Yu, Asli Celikyilmaz, Jason Weston, Sainbayar Sukhbaatar, Jack Lanchantin
   Abstract: During language model decoding, it is known that using higher temperature sampling gives more creative responses, while lower temperatures are more factually accurate. However, such models are commonly applied to general instruction following, which involves both creative and fact-seeking tasks, using a single fixed temperature across all examples and tokens. In this work, we introduce Adaptive Decoding, a layer added to the model to select the sampling temperature dynamically at inference time, at either the token or example level, in order to optimize performance. To learn its parameters we introduce Latent Preference Optimization (LPO), a general approach to train discrete latent variables such as choices of temperature. Our method outperforms all fixed decoding temperatures across a range of tasks that require different temperatures, including UltraFeedback, Creative Story Writing, and GSM8K.
   Submitted 14 November, 2024; originally announced November 2024.

10. arXiv:2411.04109 [pdf, other] (cs.CL, cs.AI, cs.LG)
    Self-Consistency Preference Optimization
    Authors: Archiki Prasad, Weizhe Yuan, Richard Yuanzhe Pang, Jing Xu, Maryam Fazel-Zarandi, Mohit Bansal, Sainbayar Sukhbaatar, Jason Weston, Jane Yu
    Abstract: Self-alignment, whereby models learn to improve themselves without human annotation, is a rapidly growing research area. However, existing techniques often fail to improve complex reasoning tasks due to the difficulty of assigning correct rewards. An orthogonal approach that is known to improve correctness is self-consistency, a method applied at inference time based on multiple sampling in order to find the most consistent answer. In this work, we extend the self-consistency concept to help train models. We thus introduce self-consistency preference optimization (ScPO), which iteratively trains consistent answers to be preferred over inconsistent ones on unsupervised new problems. We show ScPO leads to large improvements over conventional reward model training on reasoning tasks such as GSM8K and MATH, closing the gap with supervised training with gold answers or preferences, and that combining ScPO with standard supervised learning improves results even further. On ZebraLogic, ScPO finetunes Llama-3 8B to be superior to Llama-3 70B, Gemma-2 27B, and Claude-3 Haiku.
    Submitted 7 November, 2024; v1 submitted 6 November, 2024; originally announced November 2024.
    Comments: 16 pages, 3 figures
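
The pair-construction idea can be sketched without any gold labels: sample several answers to the same problem, prefer the most frequent over the least frequent, and weight the pair by the vote margin. The sampler stub and the margin-based weight are illustrative assumptions.

```python
import random
from collections import Counter

def sample_answer(problem: str) -> str:
    return random.choice(["42", "42", "42", "41", "40"])  # stub LLM sampler

def scpo_pair(problem: str, n_samples: int = 16):
    votes = Counter(sample_answer(problem) for _ in range(n_samples))
    ranked = votes.most_common()
    (chosen, top), (rejected, low) = ranked[0], ranked[-1]
    weight = (top - low) / n_samples     # confidence from the vote margin
    return chosen, rejected, weight      # feeds a weighted preference loss
                                         # (degenerate if all samples agree)
random.seed(0)
print(scpo_pair("What is 6 * 7?"))
```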

11. arXiv:2410.10630 [pdf, other] (cs.CL, cs.AI)
    Thinking LLMs: General Instruction Following with Thought Generation
    Authors: Tianhao Wu, Janice Lan, Weizhe Yuan, Jiantao Jiao, Jason Weston, Sainbayar Sukhbaatar
    Abstract: LLMs are typically trained to answer user questions or follow instructions similarly to how human experts respond. However, in the standard alignment framework they lack the basic ability of explicit thinking before answering. Thinking is important for complex questions that require reasoning and planning -- but can be applied to any task. We propose a training method for equipping existing LLMs with such thinking abilities for general instruction following without use of additional human data. We achieve this by an iterative search and optimization procedure that explores the space of possible thought generations, allowing the model to learn how to think without direct supervision. For each instruction, the thought candidates are scored using a judge model that evaluates their responses only, and are then optimized via preference optimization. We show that this procedure leads to superior performance on AlpacaEval and Arena-Hard, and shows gains from thinking on non-reasoning categories such as marketing, health, and general knowledge, in addition to more traditional reasoning and problem-solving tasks.
    Submitted 14 October, 2024; originally announced October 2024.
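
A sketch of one iteration of the described procedure: sample several (thought, response) candidates, score only the responses with a judge, and keep the best/worst pair for preference optimization. Both LLM calls are stubs; the candidate format is an assumption.

```python
import random

def generate_thought_response(instruction):
    i = random.randint(0, 9)
    return f"<thought {i}>", f"<response {i}>"    # stub (thought, response) sample

def judge_score(instruction, response):
    return random.random()                         # stub judge: sees responses only

def thinking_pair(instruction, k=8):
    cands = [generate_thought_response(instruction) for _ in range(k)]
    scored = sorted(cands, key=lambda c: judge_score(instruction, c[1]))
    return scored[-1], scored[0]                   # chosen, rejected (with thoughts)

random.seed(0)
chosen, rejected = thinking_pair("Write a tagline for a bakery.")
print(chosen[1], rejected[1])
```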

12. arXiv:2409.14586 [pdf, other] (cs.LG, cs.AI, cs.CL)
    Backtracking Improves Generation Safety
    Authors: Yiming Zhang, Jianfeng Chi, Hailey Nguyen, Kartikeya Upasani, Daniel M. Bikel, Jason Weston, Eric Michael Smith
    Abstract: Text generation has a fundamental limitation almost by definition: there is no taking back tokens that have been generated, even when they are clearly problematic. In the context of language model safety, when a partial unsafe generation is produced, language models by their nature tend to happily keep on generating similarly unsafe additional text. This is in fact how safety alignment of frontier models gets circumvented in the wild, despite great efforts in improving their safety. Deviating from the paradigm of approaching safety alignment as prevention (decreasing the probability of harmful responses), we propose backtracking, a technique that allows language models to "undo" and recover from their own unsafe generation through the introduction of a special [RESET] token. Our method can be incorporated into either SFT or DPO training to optimize helpfulness and harmlessness. We show that models trained to backtrack are consistently safer than baseline models: backtracking Llama-3-8B is four times safer than the baseline model (6.1% -> 1.5%) in our evaluations, without regression in helpfulness. Our method additionally provides protection against four adversarial attacks, including an adaptive attack, despite not being trained to do so.
    Submitted 22 September, 2024; originally announced September 2024.
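
The inference-time behavior of the [RESET] token can be sketched directly: when the model emits it, the partial draft is discarded and generation restarts. The sampler stub and the retry budget are illustrative; the training side (SFT or DPO with backtracking data) is not shown.

```python
import random

RESET = "[RESET]"

def sample_tokens():                      # stub for a trained LM's sampler
    if random.random() < 0.5:
        return ["Sure,", "here", "is", RESET]   # model catches its own mistake
    return ["I", "can't", "help", "with", "that."]

def generate_with_backtracking(max_restarts: int = 4):
    for _ in range(max_restarts + 1):
        draft = []
        for tok in sample_tokens():
            if tok == RESET:
                draft = None              # undo: throw away the partial output
                break
            draft.append(tok)
        if draft is not None:
            return " ".join(draft)
    return ""                             # give up after too many resets

random.seed(1)
print(generate_with_backtracking())
```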
arXiv:2409.08239 [pdf, other] cs.CL cs.AI
Source2Synth: Synthetic Data Generation and Curation Grounded in Real Data Sources
Authors: Alisia Lupidi, Carlos Gemmell, Nicola Cancedda, Jane Dwivedi-Yu, Jason Weston, Jakob Foerster, Roberta Raileanu, Maria Lomeli
Abstract: Large Language Models still struggle in challenging scenarios that leverage structured data, complex reasoning, or tool usage. In this paper, we propose Source2Synth: a new method that can be used for teaching LLMs new skills without relying on costly human annotations. Source2Synth takes as input a custom data source and produces synthetic data points with intermediate reasoning steps grounded in real-world sources. Source2Synth improves the dataset quality by discarding low-quality generations based on their answerability. We demonstrate the generality of this approach by applying it to two challenging domains: we test reasoning abilities in multi-hop question answering (MHQA), and tool usage in tabular question answering (TQA). Our method improves performance by 25.51% for TQA on WikiSQL and 22.57% for MHQA on HotPotQA compared to the fine-tuned baselines.
Submitted 12 September, 2024; originally announced September 2024.
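The curation step (discarding generations by answerability) can be sketched compactly. A minimal reading, with a hypothetical `llm` stub standing in for any text-completion model:

```python
llm = lambda prompt: "Paris"  # stub

def curate(examples):
    """Keep a synthetic (source, question, answer) example only if the
    answer can be re-derived from its grounding source."""
    kept = []
    for ex in examples:
        attempt = llm(f"Source:\n{ex['source']}\n\nAnswer:\n{ex['question']}")
        if attempt.strip().lower() == ex["answer"].strip().lower():
            kept.append(ex)  # answerable from the source -> higher quality
    return kept
```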
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.08239v1-abstract-full').style.display = 'none'; document.getElementById('2409.08239v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.04614">arXiv:2408.04614</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.04614">pdf</a>, <a href="https://arxiv.org/format/2408.04614">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Better Alignment with Instruction Back-and-Forth Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+T">Thao Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jeffrey Li</a>, <a href="/search/cs?searchtype=author&amp;query=Oh%2C+S">Sewoong Oh</a>, <a href="/search/cs?searchtype=author&amp;query=Schmidt%2C+L">Ludwig Schmidt</a>, <a href="/search/cs?searchtype=author&amp;query=Weston%2C+J">Jason Weston</a>, <a href="/search/cs?searchtype=author&amp;query=Zettlemoyer%2C+L">Luke Zettlemoyer</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xian Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.04614v2-abstract-short" style="display: inline;"> We propose a new method, instruction back-and-forth translation, to construct high-quality synthetic data grounded in world knowledge for aligning large language models (LLMs). Given documents from a web corpus, we generate and curate synthetic instructions using the backtranslation approach proposed by Li et al.(2023a), and rewrite the responses to improve their quality further based on the initi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04614v2-abstract-full').style.display = 'inline'; document.getElementById('2408.04614v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.04614v2-abstract-full" style="display: none;"> We propose a new method, instruction back-and-forth translation, to construct high-quality synthetic data grounded in world knowledge for aligning large language models (LLMs). Given documents from a web corpus, we generate and curate synthetic instructions using the backtranslation approach proposed by Li et al.(2023a), and rewrite the responses to improve their quality further based on the initial documents. Fine-tuning with the resulting (backtranslated instruction, rewritten response) pairs yields higher win rates on AlpacaEval than using other common instruction datasets such as Humpback, ShareGPT, Open Orca, Alpaca-GPT4 and Self-instruct. 
We also demonstrate that rewriting the responses with an LLM outperforms direct distillation, and the two generated text distributions exhibit significant distinction in embedding space. Further analysis shows that our backtranslated instructions are of higher quality than other sources of synthetic instructions, while our responses are more diverse and complex than those obtained from distillation. Overall, we find that instruction back-and-forth translation combines the best of both worlds: it makes use of the information diversity and quantity found on the web, while ensuring the quality of the responses, which is necessary for effective alignment.
Submitted 13 August, 2024; v1 submitted 8 August, 2024; originally announced August 2024.
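The two-step pipeline (backtranslate an instruction from a document, then rewrite the document into a response) can be sketched as follows; `llm` is a hypothetical stub for any instruct model:

```python
llm = lambda prompt: "..."  # stub

def back_and_forth(document):
    # step 1: backtranslation -- derive the instruction the document answers
    instruction = llm(
        "Write the user instruction that this text would best answer:\n" + document
    )
    # step 2: rewrite the document into a clean response to that instruction
    response = llm(
        f"Instruction: {instruction}\n"
        f"Rewrite the following text into a direct, high-quality answer:\n{document}"
    )
    return {"instruction": instruction, "response": response}  # SFT pair
```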
arXiv:2408.02666 [pdf, other] cs.CL cs.AI
Self-Taught Evaluators
Authors: Tianlu Wang, Ilia Kulikov, Olga Golovneva, Ping Yu, Weizhe Yuan, Jane Dwivedi-Yu, Richard Yuanzhe Pang, Maryam Fazel-Zarandi, Jason Weston, Xian Li
Abstract: Model-based evaluation is at the heart of successful model development -- as a reward model for training, and as a replacement for human evaluation. To train such evaluators, the standard approach is to collect a large amount of human preference judgments over model responses, which is costly, and the data becomes stale as models improve. In this work, we present an approach that aims to improve evaluators without human annotations, using synthetic training data only. Starting from unlabeled instructions, our iterative self-improvement scheme generates contrasting model outputs and trains an LLM-as-a-Judge to produce reasoning traces and final judgments, repeating this training at each new iteration using the improved predictions. Without any labeled preference data, our Self-Taught Evaluator can improve a strong LLM (Llama3-70B-Instruct) from 75.4 to 88.3 (88.7 with majority vote) on RewardBench. This outperforms commonly used LLM judges such as GPT-4 and matches the performance of the top-performing reward models trained with labeled examples.
Submitted 8 August, 2024; v1 submitted 5 August, 2024; originally announced August 2024.
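One iteration of the scheme can be sketched as below: build a contrasting pair with a known winner, and keep only the judge's reasoning traces that pick the right side as next-round training data. `llm` is a hypothetical stub:

```python
llm = lambda prompt: "..."  # stub

def make_judge_example(instruction):
    good = llm(instruction)
    # a deliberately worse answer, e.g. to a subtly corrupted instruction
    bad = llm(instruction + "\n(answer a slightly different question instead)")
    verdict = llm(f"Instruction: {instruction}\nA: {good}\nB: {bad}\n"
                  "Reason step by step, then output the better answer: A or B.")
    if verdict.strip().endswith("A"):   # judgment matches the known winner
        return {"prompt": instruction, "trace": verdict}  # fine-tune on this
    return None                         # discard incorrect judgments
```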
arXiv:2407.19594 [pdf, other] cs.CL cs.AI
Meta-Rewarding Language Models: Self-Improving Alignment with LLM-as-a-Meta-Judge
Authors: Tianhao Wu, Weizhe Yuan, Olga Golovneva, Jing Xu, Yuandong Tian, Jiantao Jiao, Jason Weston, Sainbayar Sukhbaatar
Abstract: Large Language Models (LLMs) are rapidly surpassing human knowledge in many domains. While improving these models traditionally relies on costly human data, recent self-rewarding mechanisms (Yuan et al., 2024) have shown that LLMs can improve by judging their own responses instead of relying on human labelers. However, existing methods have primarily focused on improving model responses rather than judgment capabilities, resulting in rapid saturation during iterative training. To address this issue, we introduce a novel Meta-Rewarding step to the self-improvement process, where the model judges its own judgements and uses that feedback to refine its judgment skills. Surprisingly, this unsupervised approach improves the model's ability to judge and follow instructions, as demonstrated by a win rate improvement of Llama-3-8B-Instruct from 22.9% to 39.4% on AlpacaEval 2, and from 20.6% to 29.1% on Arena-Hard. These results strongly suggest the potential for self-improving models without human supervision.
Submitted 29 July, 2024; v1 submitted 28 July, 2024; originally announced July 2024.
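The meta-judge step can be sketched as preference pairs over judgments rather than over responses; `llm` is a hypothetical stub (in practice both judgments would be sampled with temperature so they differ):

```python
llm = lambda prompt: "..."  # stub

def meta_reward(instruction, response):
    j1 = llm(f"Judge this response to '{instruction}':\n{response}")
    j2 = llm(f"Judge this response to '{instruction}':\n{response}")
    meta = llm("Which judgment is more fair and thorough?\n"
               f"Judgment 1: {j1}\nJudgment 2: {j2}\nAnswer 1 or 2.")
    winner, loser = (j1, j2) if meta.strip().startswith("1") else (j2, j1)
    return winner, loser  # a preference pair over judgments, usable for DPO
```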
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19594v2-abstract-full').style.display = 'none'; document.getElementById('2407.19594v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.06023">arXiv:2407.06023</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.06023">pdf</a>, <a href="https://arxiv.org/format/2407.06023">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Distilling System 2 into System 1 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yu%2C+P">Ping Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jing Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Weston%2C+J">Jason Weston</a>, <a href="/search/cs?searchtype=author&amp;query=Kulikov%2C+I">Ilia Kulikov</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.06023v3-abstract-short" style="display: inline;"> Large language models (LLMs) can spend extra compute during inference to generate intermediate thoughts, which helps to produce better final responses. Since Chain-of-Thought (Wei et al., 2022), many such System 2 techniques have been proposed such as Rephrase and Respond (Deng et al., 2023a), System 2 Attention (Weston and Sukhbaatar, 2023) and Branch-Solve-Merge (Saha et al., 2023). In this work&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06023v3-abstract-full').style.display = 'inline'; document.getElementById('2407.06023v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.06023v3-abstract-full" style="display: none;"> Large language models (LLMs) can spend extra compute during inference to generate intermediate thoughts, which helps to produce better final responses. Since Chain-of-Thought (Wei et al., 2022), many such System 2 techniques have been proposed such as Rephrase and Respond (Deng et al., 2023a), System 2 Attention (Weston and Sukhbaatar, 2023) and Branch-Solve-Merge (Saha et al., 2023). In this work we investigate self-supervised methods to ``compile&#39;&#39; (distill) higher quality outputs from System 2 techniques back into LLM generations without intermediate reasoning token sequences, as this reasoning has been distilled into System 1. We show that several such techniques can be successfully distilled, resulting in improved results compared to the original System 1 performance, and with less inference cost than System 2. 
We posit that such System 2 distillation will be an important feature of future continually learning AI systems, enabling them to focus System 2 capabilities on the reasoning tasks that they cannot yet do well.
Submitted 24 July, 2024; v1 submitted 8 July, 2024; originally announced July 2024.
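A common self-supervised recipe for this kind of distillation is a self-consistency filter: run the expensive System 2 method several times, trust the label only when the final answers agree, and fine-tune on bare (input, answer) pairs. A minimal sketch under that assumption, with `system2` as a stub for e.g. S2A or Branch-Solve-Merge:

```python
from collections import Counter

system2 = lambda x: "42"  # stub: reasons internally, returns the final answer

def distill_dataset(inputs, n=5):
    data = []
    for x in inputs:
        answers = Counter(system2(x) for _ in range(n))
        answer, votes = answers.most_common(1)[0]
        if votes >= n - 1:  # consistent across samples -> trust the label
            # System 1 SFT pair: no intermediate reasoning tokens kept
            data.append({"input": x, "target": answer})
    return data
```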
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17744v1-abstract-full').style.display = 'none'; document.getElementById('2406.17744v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.18719">arXiv:2405.18719</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.18719">pdf</a>, <a href="https://arxiv.org/format/2405.18719">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Contextual Position Encoding: Learning to Count What&#39;s Important </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Golovneva%2C+O">Olga Golovneva</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+T">Tianlu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Weston%2C+J">Jason Weston</a>, <a href="/search/cs?searchtype=author&amp;query=Sukhbaatar%2C+S">Sainbayar Sukhbaatar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.18719v2-abstract-short" style="display: inline;"> The attention mechanism is a critical component of Large Language Models (LLMs) that allows tokens in a sequence to interact with each other, but is order-invariant. Incorporating position encoding (PE) makes it possible to address by position, such as attending to the i-th token. However, current PE methods use token counts to derive position, and thus cannot generalize to higher levels of abstra&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18719v2-abstract-full').style.display = 'inline'; document.getElementById('2405.18719v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.18719v2-abstract-full" style="display: none;"> The attention mechanism is a critical component of Large Language Models (LLMs) that allows tokens in a sequence to interact with each other, but is order-invariant. Incorporating position encoding (PE) makes it possible to address by position, such as attending to the i-th token. However, current PE methods use token counts to derive position, and thus cannot generalize to higher levels of abstraction, such as attending to the i-th sentence. In this paper, we propose a new position encoding method, Contextual Position Encoding (CoPE), that allows positions to be conditioned on context by incrementing position only on certain tokens determined by the model. This allows more general position addressing such as attending to the $i$-th particular word, noun, or sentence. 
arXiv:2405.18719 [pdf, other] cs.CL cs.AI
Contextual Position Encoding: Learning to Count What's Important
Authors: Olga Golovneva, Tianlu Wang, Jason Weston, Sainbayar Sukhbaatar
Abstract: The attention mechanism is a critical component of Large Language Models (LLMs) that allows tokens in a sequence to interact with each other, but is order-invariant. Incorporating position encoding (PE) makes it possible to address by position, such as attending to the i-th token. However, current PE methods use token counts to derive position, and thus cannot generalize to higher levels of abstraction, such as attending to the i-th sentence. In this paper, we propose a new position encoding method, Contextual Position Encoding (CoPE), that allows positions to be conditioned on context by incrementing position only on certain tokens determined by the model. This allows more general position addressing, such as attending to the $i$-th particular word, noun, or sentence. We show that CoPE can solve the selective copy, counting and Flip-Flop tasks where popular position embeddings fail, and improves perplexity on language modeling and coding tasks.
Submitted 30 May, 2024; v1 submitted 28 May, 2024; originally announced May 2024.
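A toy numpy sketch of the CoPE idea for a single query at the last position: positions are cumulative sums of context-dependent gates rather than token counts, and the resulting fractional position's logit is linearly interpolated between integer position embeddings. This is a simplification for illustration, not the paper's code:

```python
import numpy as np

def cope_logits(q, K, pos_emb):
    """q: (d,), K: (seq, d), pos_emb: (max_pos, d). Returns attention
    logits combining content and contextual-position terms."""
    gates = 1 / (1 + np.exp(-(K @ q)))        # sigmoid(q . k_j), shape (seq,)
    p = np.cumsum(gates[::-1])[::-1]          # p_j = sum of gates from j to end
    p = np.clip(p, 0, pos_emb.shape[0] - 1)
    lo = np.floor(p).astype(int)
    hi = np.minimum(lo + 1, pos_emb.shape[0] - 1)
    w = p - lo                                # interpolate position logits
    pos_logit = (1 - w) * (pos_emb[lo] @ q) + w * (pos_emb[hi] @ q)
    return K @ q + pos_logit

np.random.seed(0)
print(cope_logits(np.random.randn(4), np.random.randn(6, 4), np.random.randn(8, 4)))
```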
arXiv:2404.19733 [pdf, other] cs.CL cs.AI
Iterative Reasoning Preference Optimization
Authors: Richard Yuanzhe Pang, Weizhe Yuan, Kyunghyun Cho, He He, Sainbayar Sukhbaatar, Jason Weston
Abstract: Iterative preference optimization methods have recently been shown to perform well for general instruction tuning tasks, but typically make little improvement on reasoning tasks (Yuan et al., 2024; Chen et al., 2024). In this work we develop an iterative approach that optimizes the preference between competing generated Chain-of-Thought (CoT) candidates by optimizing for winning vs. losing reasoning steps that lead to the correct answer. We train using a modified DPO loss (Rafailov et al., 2023) with an additional negative log-likelihood term, which we find to be crucial. We show reasoning improves across repeated iterations of this scheme. While relying only on examples in the training set, our approach results in increasing accuracy on GSM8K, MATH, and ARC-Challenge for Llama-2-70B-Chat, outperforming other Llama-2-based models that do not rely on additionally sourced datasets. For example, we see a large improvement from 55.6% to 81.6% on GSM8K, and an accuracy of 88.7% with majority voting out of 32 samples.
Submitted 25 June, 2024; v1 submitted 30 April, 2024; originally announced April 2024.
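The modified objective can be sketched as standard DPO plus a weighted NLL term on the winning CoT. A minimal reading (the length normalization and the weight `alpha` are assumptions for illustration):

```python
import torch
import torch.nn.functional as F

def irpo_loss(logp_w, logp_l, ref_logp_w, ref_logp_l, n_tokens_w,
              beta=0.1, alpha=1.0):
    """logp_*: summed log-probs of the winning/losing CoT+answer under the
    policy; ref_logp_*: the same under the frozen reference model."""
    dpo = -F.logsigmoid(beta * ((logp_w - ref_logp_w) - (logp_l - ref_logp_l)))
    nll = -logp_w / n_tokens_w   # extra NLL term on the winning sequence
    return dpo + alpha * nll

loss = irpo_loss(torch.tensor(-40.0), torch.tensor(-55.0),
                 torch.tensor(-42.0), torch.tensor(-50.0), n_tokens_w=30)
```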
arXiv:2403.13799 [pdf, other] cs.CL cs.AI
Reverse Training to Nurse the Reversal Curse
Authors: Olga Golovneva, Zeyuan Allen-Zhu, Jason Weston, Sainbayar Sukhbaatar
Abstract: Large language models (LLMs) have a surprising failure: when trained on "A has a feature B", they do not generalize to "B is a feature of A", which is termed the Reversal Curse. Due to Zipf's law, this issue appears even when training with trillions of tokens, and hence even if we train on the entire internet. This work proposes an alternative training scheme, called reverse training, whereby all words are used twice, doubling the amount of available tokens. The LLM is trained in both forward and reverse directions by reversing the training strings while preserving (i.e., not reversing) chosen substrings, such as entities. We show that data-matched reverse-trained models provide superior performance to standard models on standard tasks, and compute-matched reverse-trained models provide far superior performance on reversal tasks, helping to resolve the reversal curse issue.
Submitted 7 May, 2024; v1 submitted 20 March, 2024; originally announced March 2024.
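The entity-preserving reversal can be sketched at the string level; the entity list is assumed given (e.g. from an NER pass), and the tokenization here is a toy word split:

```python
def reverse_preserving(text, entities):
    """Reverse word order but keep each entity span intact as one unit."""
    tokens, i = [], 0
    words = text.split()
    while i < len(words):
        for ent in entities:
            ent_words = ent.split()
            if words[i:i + len(ent_words)] == ent_words:
                tokens.append(ent)          # entity kept un-reversed
                i += len(ent_words)
                break
        else:
            tokens.append(words[i])
            i += 1
    return " ".join(reversed(tokens))

print(reverse_preserving("Abraham Lincoln was the 16th president",
                         ["Abraham Lincoln"]))
# -> "president 16th the was Abraham Lincoln"
```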
arXiv:2403.07816 [pdf, other] cs.CL cs.AI
Branch-Train-MiX: Mixing Expert LLMs into a Mixture-of-Experts LLM
Authors: Sainbayar Sukhbaatar, Olga Golovneva, Vasu Sharma, Hu Xu, Xi Victoria Lin, Baptiste Rozière, Jacob Kahn, Daniel Li, Wen-tau Yih, Jason Weston, Xian Li
Abstract: We investigate efficient methods for training Large Language Models (LLMs) to possess capabilities in multiple specialized domains, such as coding, math reasoning and world knowledge. Our method, named Branch-Train-MiX (BTX), starts from a seed model, which is branched to train experts in embarrassingly parallel fashion with high throughput and reduced communication cost. After individual experts are asynchronously trained, BTX brings together their feedforward parameters as experts in Mixture-of-Experts (MoE) layers, averages the remaining parameters, and then applies an MoE-finetuning stage to learn token-level routing. BTX generalizes two special cases: the Branch-Train-Merge method, which does not have the MoE finetuning stage to learn routing, and sparse upcycling, which omits the stage of training experts asynchronously. Compared to alternative approaches, BTX achieves the best accuracy-efficiency tradeoff.
Submitted 12 March, 2024; originally announced March 2024.
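The merge step can be sketched at the state-dict level: stack feedforward weights as MoE experts, average everything else. The `.feed_forward.` parameter naming is an assumption (Llama-style), and router weights would be learned in the subsequent MoE-finetuning stage:

```python
import torch

def btx_merge(expert_state_dicts):
    merged, experts = {}, {}
    for name in expert_state_dicts[0]:
        tensors = [sd[name] for sd in expert_state_dicts]
        if ".feed_forward." in name:              # one MoE expert per model
            experts[name] = torch.stack(tensors)  # (n_experts, ...)
        else:                                     # attention, norms, embeddings
            merged[name] = torch.stack(tensors).mean(0)
    return merged, experts
```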
arXiv:2402.14158 [pdf, other] cs.CL
TOOLVERIFIER: Generalization to New Tools via Self-Verification
Authors: Dheeraj Mekala, Jason Weston, Jack Lanchantin, Roberta Raileanu, Maria Lomeli, Jingbo Shang, Jane Dwivedi-Yu
Abstract: Teaching language models to use tools is an important milestone towards building general assistants, but remains an open problem. While there has been significant progress on learning to use specific tools via fine-tuning, language models still struggle with learning how to robustly use new tools from only a few demonstrations. In this work we introduce a self-verification method which distinguishes between close candidates by self-asking contrastive questions during (1) tool selection and (2) parameter generation. We construct synthetic, high-quality, self-generated data for this goal using Llama-2 70B, which we intend to release publicly. Extensive experiments on 4 tasks from the ToolBench benchmark, consisting of 17 unseen tools, demonstrate an average improvement of 22% over few-shot baselines, even in scenarios where the distinctions between candidate tools are finely nuanced.
Submitted 13 March, 2024; v1 submitted 21 February, 2024; originally announced February 2024.

arXiv:2401.10020 [pdf, other] cs.CL cs.AI
Self-Rewarding Language Models
Authors: Weizhe Yuan, Richard Yuanzhe Pang, Kyunghyun Cho, Xian Li, Sainbayar Sukhbaatar, Jing Xu, Jason Weston
Abstract: We posit that to achieve superhuman agents, future models require superhuman feedback in order to provide an adequate training signal. Current approaches commonly train reward models from human preferences, which may then be bottlenecked by human performance level; moreover, these separate frozen reward models cannot learn to improve during LLM training. In this work, we study Self-Rewarding Language Models, where the language model itself is used via LLM-as-a-Judge prompting to provide its own rewards during training. We show that during Iterative DPO training, not only does instruction following ability improve, but also the model's ability to provide high-quality rewards to itself. Fine-tuning Llama 2 70B on three iterations of our approach yields a model that outperforms many existing systems on the AlpacaEval 2.0 leaderboard, including Claude 2, Gemini Pro, and GPT-4 0613. While there is still much left to explore, this work opens the door to the possibility of models that can continually improve on both axes.
Submitted 8 February, 2024; v1 submitted 18 January, 2024; originally announced January 2024.
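One self-rewarding iteration can be sketched as: generate candidates, score them with an LLM-as-a-Judge prompt to the same model, and turn best/worst into DPO pairs. `llm` and the 0-5 scoring prompt are illustrative stubs:

```python
llm = lambda prompt: "..."  # stub

def self_reward_pairs(instructions, k=4):
    pairs = []
    for x in instructions:
        scored = []
        for cand in (llm(x) for _ in range(k)):
            s = llm(f"Review the response and score it 0-5.\n"
                    f"Instruction: {x}\nResponse: {cand}\nScore:")
            try:
                scored.append((float(s.strip()[:1]), cand))
            except ValueError:
                continue  # unparseable judgment, skip
        if len(scored) >= 2:
            scored.sort(reverse=True)
            pairs.append({"prompt": x, "chosen": scored[0][1],
                          "rejected": scored[-1][1]})
    return pairs  # input to the next Iterative DPO round
```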
arXiv:2312.16682 [pdf, other] cs.CL cs.AI
Some things are more CRINGE than others: Iterative Preference Optimization with the Pairwise Cringe Loss
Authors: Jing Xu, Andrew Lee, Sainbayar Sukhbaatar, Jason Weston
Abstract: Practitioners commonly align large language models using pairwise preferences, i.e., given labels of the type "response A is preferred to response B" for a given input. Perhaps less commonly, methods have also been developed for binary feedback, i.e., training models given labels of the type "response A is good or bad". We show how an existing performant binary feedback method, the Cringe Loss (Adolphs et al., 2022), can be generalized to the pairwise preference setting using a simple soft margin extension. Pairwise Cringe Loss is straightforward to implement and efficient to train, and we find it outperforms state-of-the-art preference optimization algorithms such as PPO and DPO on the AlpacaFarm benchmark. We show that iterations of training of our model are important for improved results, and that we can generalize DPO to Iterative DPO in the same way.
Submitted 22 April, 2024; v1 submitted 27 December, 2023; originally announced December 2023.
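The abstract names, but does not specify, the soft margin extension. One plausible reading, offered only as a sketch and not as the paper's exact formulation: apply the binary Cringe loss to the rejected response in proportion to a sigmoid gate on how close the pair's likelihoods are.

```python
import torch

def pairwise_cringe(logp_chosen, logp_rejected, cringe_loss,
                    margin=1.0, tau=1.0):
    # gate ~ 1 when the rejected response is nearly as likely as the chosen
    # one (within the margin); ~ 0 when the pair is already well separated
    gate = torch.sigmoid((logp_rejected - logp_chosen + margin) / tau)
    return gate * cringe_loss

loss = pairwise_cringe(torch.tensor(-10.0), torch.tensor(-11.0),
                       torch.tensor(2.0))
```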
arXiv:2311.11829 [pdf, other] cs.CL cs.AI cs.LG
System 2 Attention (is something you might need too)
Authors: Jason Weston, Sainbayar Sukhbaatar
Abstract: Soft attention in Transformer-based Large Language Models (LLMs) is susceptible to incorporating irrelevant information from the context into its latent representations, which adversely affects next token generations. To help rectify these issues, we introduce System 2 Attention (S2A), which leverages the ability of LLMs to reason in natural language and follow instructions in order to decide what to attend to. S2A regenerates the input context to include only the relevant portions, before attending to the regenerated context to elicit the final response. In experiments, S2A outperforms standard attention-based LLMs on three tasks containing opinion or irrelevant information: QA, math word problems and longform generation. S2A increases factuality and objectivity, and decreases sycophancy.
Submitted 20 November, 2023; originally announced November 2023.
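S2A as described is a two-step prompting pipeline, which can be sketched directly; `llm` is a hypothetical stub and the prompt wording is illustrative:

```python
llm = lambda prompt: "..."  # stub

def s2a(context, question):
    # step 1: regenerate the context, keeping only relevant, unbiased parts
    cleaned = llm(
        "Rewrite the following text, keeping only the parts that are "
        f"relevant and unopinionated for answering the question.\n"
        f"Text: {context}\nQuestion: {question}"
    )
    # step 2: answer from the regenerated context alone
    return llm(f"Context: {cleaned}\nQuestion: {question}\nAnswer:")
```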
arXiv:2311.07961 [pdf, other] cs.CL
The ART of LLM Refinement: Ask, Refine, and Trust
Authors: Kumar Shridhar, Koustuv Sinha, Andrew Cohen, Tianlu Wang, Ping Yu, Ram Pasunuru, Mrinmaya Sachan, Jason Weston, Asli Celikyilmaz
Abstract: In recent years, Large Language Models (LLMs) have demonstrated remarkable generative abilities, but can they judge the quality of their own generations? A popular concept, referred to as self-refinement, postulates that LLMs can detect and correct errors in their generations when asked to do so. However, recent empirical evidence points in the opposite direction, suggesting that LLMs often struggle to accurately identify errors when reasoning is involved. To address this, we propose a reasoning-with-refinement objective called ART (Ask, Refine, and Trust), which asks the necessary questions to decide when an LLM should refine its output, and either affirms or withholds trust in its refinement by ranking the refinement against the initial prediction. On two multistep reasoning tasks, mathematical word problems (GSM8K) and question answering (StrategyQA), ART achieves a performance gain of +5 points over self-refinement baselines, while using a much smaller model as the decision maker. We also demonstrate the benefit of using smaller models to make refinement decisions as a cost-effective alternative to fine-tuning a larger model.
Submitted 14 November, 2023; originally announced November 2023.

<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.07961v1-abstract-full').style.display = 'none'; document.getElementById('2311.07961v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.15123">arXiv:2310.15123</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.15123">pdf</a>, <a href="https://arxiv.org/format/2310.15123">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Branch-Solve-Merge Improves Large Language Model Evaluation and Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Saha%2C+S">Swarnadeep Saha</a>, <a href="/search/cs?searchtype=author&amp;query=Levy%2C+O">Omer Levy</a>, <a href="/search/cs?searchtype=author&amp;query=Celikyilmaz%2C+A">Asli Celikyilmaz</a>, <a href="/search/cs?searchtype=author&amp;query=Bansal%2C+M">Mohit Bansal</a>, <a href="/search/cs?searchtype=author&amp;query=Weston%2C+J">Jason Weston</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xian Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.15123v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) are frequently used for multi-faceted language generation and evaluation tasks that involve satisfying intricate user constraints or taking into account multiple aspects and criteria. However, their performance can fall short, due to the model&#39;s lack of coherence and inability to plan and decompose the problem. We propose Branch-Solve-Merge (BSM), a Large Language Mode&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.15123v2-abstract-full').style.display = 'inline'; document.getElementById('2310.15123v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.15123v2-abstract-full" style="display: none;"> Large Language Models (LLMs) are frequently used for multi-faceted language generation and evaluation tasks that involve satisfying intricate user constraints or taking into account multiple aspects and criteria. However, their performance can fall short, due to the model&#39;s lack of coherence and inability to plan and decompose the problem. We propose Branch-Solve-Merge (BSM), a Large Language Model program (Schlag et al., 2023) for tackling such challenging natural language tasks. It consists of branch, solve, and merge modules that are parameterized with specific prompts to the base LLM. These three modules plan a decomposition of the task into multiple parallel sub-tasks, independently solve them, and fuse the solutions to the sub-tasks. 

arXiv:2310.05029 (https://arxiv.org/abs/2310.05029) [pdf, other]
Subjects: cs.CL
Title: Walking Down the Memory Maze: Beyond Context Limit through Interactive Reading
Authors: Howard Chen, Ramakanth Pasunuru, Jason Weston, Asli Celikyilmaz
Abstract: Large language models (LLMs) have made great strides due to the effectiveness of the self-attention mechanism that processes and compares all tokens at once. However, this mechanism comes with a fundamental issue: the predetermined context window is bound to be limited. Despite attempts to extend the context window through methods like extrapolating the positional embedding, using recurrence, or selectively retrieving essential parts of the long sequence, long-text understanding continues to be a challenge. We propose an alternative approach which instead treats the LLM as an interactive agent, allowing it to decide how to read the text via iterative prompting. We introduce MemWalker, a method that first processes the long context into a tree of summary nodes. Upon receiving a query, the model navigates this tree in search of relevant information, and responds once it gathers sufficient information. On long-text question answering tasks our method outperforms baseline approaches that use long context windows, recurrence, and retrieval. We show that, beyond effective reading, MemWalker enhances explainability by highlighting the reasoning steps as it interactively reads the text, pinpointing the relevant text segments related to the query.
Submitted 8 October, 2023; originally announced October 2023.
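
A rough sketch of the two phases described in the abstract, under strong simplifying assumptions: `llm` is a hypothetical callable, the prompts are invented, and navigation here is one-way down the tree, whereas the paper's agent can also revisit nodes while it gathers information.

```python
def build_summary_tree(segments, llm, fanout=4):
    """Fold text segments into a tree of summary nodes, bottom up."""
    nodes = [{"summary": llm(f"Summarize:\n{s}"), "children": [], "text": s}
             for s in segments]
    while len(nodes) > 1:
        groups = [nodes[i:i + fanout] for i in range(0, len(nodes), fanout)]
        nodes = [{"summary": llm("Summarize these summaries:\n" +
                                 "\n".join(c["summary"] for c in g)),
                  "children": g, "text": None}
                 for g in groups]
    return nodes[0]

def answer_query(query, root, llm):
    node = root
    while node["children"]:
        # Navigation: the model picks which summary node to descend into.
        menu = "\n".join(f"{i}: {c['summary']}"
                         for i, c in enumerate(node["children"]))
        choice = llm(f"Query: {query}\nChild summaries:\n{menu}\n"
                     "Which node should be read next? Reply with its number:")
        node = node["children"][int(choice.strip())]
    return llm(f"Passage:\n{node['text']}\nQuery: {query}\nAnswer:")
```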

arXiv:2309.11495 (https://arxiv.org/abs/2309.11495) [pdf, other]
Subjects: cs.CL; cs.AI
Title: Chain-of-Verification Reduces Hallucination in Large Language Models
Authors: Shehzaad Dhuliawala, Mojtaba Komeili, Jing Xu, Roberta Raileanu, Xian Li, Asli Celikyilmaz, Jason Weston
Abstract: Generation of plausible yet incorrect factual information, termed hallucination, is an unsolved issue in large language models. We study the ability of language models to deliberate on the responses they give in order to correct their mistakes. We develop the Chain-of-Verification (CoVe) method whereby the model first (i) drafts an initial response; then (ii) plans verification questions to fact-check its draft; (iii) answers those questions independently so the answers are not biased by other responses; and (iv) generates its final verified response. In experiments, we show CoVe decreases hallucinations across a variety of tasks, from list-based questions from Wikidata to closed-book MultiSpanQA and long-form text generation.
Submitted 25 September, 2023; v1 submitted 20 September, 2023; originally announced September 2023.
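
The abstract enumerates the four steps explicitly, so they translate almost directly into code. A minimal sketch, assuming a hypothetical `llm` callable and illustrative prompt strings:

```python
def chain_of_verification(query, llm):
    # (i) Draft an initial response.
    draft = llm(f"{query}\nAnswer:")
    # (ii) Plan verification questions that fact-check the draft.
    plan = llm(f"Question: {query}\nDraft answer: {draft}\n"
               "List verification questions to fact-check the draft, one per line:")
    questions = [q.strip() for q in plan.splitlines() if q.strip()]
    # (iii) Answer each question independently, without showing the draft,
    # so the answers are not biased by the earlier response.
    checks = [(q, llm(f"{q}\nAnswer:")) for q in questions]
    evidence = "\n".join(f"Q: {q}\nA: {a}" for q, a in checks)
    # (iv) Generate the final verified response.
    return llm(f"Question: {query}\nDraft answer: {draft}\n"
               f"Verification Q&A:\n{evidence}\n"
               "Write a final answer consistent with the verified facts:")
```

The key design choice is step (iii): answering the verification questions in fresh calls, without the draft in context, is what keeps the checks from inheriting the draft's mistakes.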

arXiv:2308.06259 (https://arxiv.org/abs/2308.06259) [pdf, other]
Subjects: cs.CL
Title: Self-Alignment with Instruction Backtranslation
Authors: Xian Li, Ping Yu, Chunting Zhou, Timo Schick, Omer Levy, Luke Zettlemoyer, Jason Weston, Mike Lewis
Abstract: We present a scalable method to build a high-quality instruction-following language model by automatically labelling human-written text with corresponding instructions. Our approach, named instruction backtranslation, starts with a language model finetuned on a small amount of seed data, and a given web corpus. The seed model is used to construct training examples by generating instruction prompts for web documents (self-augmentation), and then selecting high-quality examples from among these candidates (self-curation). This data is then used to finetune a stronger model. Finetuning LLaMa on two iterations of our approach yields a model that outperforms all other LLaMa-based models on the Alpaca leaderboard that do not rely on distillation data, demonstrating highly effective self-alignment.
Submitted 12 March, 2024; v1 submitted 11 August, 2023; originally announced August 2023.
Comments: ICLR 2024 camera ready
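
One self-augmentation/self-curation round might look like the sketch below. The `llm` callable, the prompts, and the 1-5 rubric with a threshold of 4 are all illustrative assumptions; only the two-step structure comes from the abstract.

```python
def backtranslation_round(web_documents, llm, threshold=4):
    """One self-augmentation + self-curation pass over a web corpus."""
    curated = []
    for doc in web_documents:
        # Self-augmentation: predict the instruction this document answers.
        instruction = llm("Write the instruction for which the following "
                          f"text would be a good response:\n{doc}")
        # Self-curation: the same seed model rates its own candidate pair;
        # only high-scoring pairs survive as finetuning data.
        score = llm("Rate from 1 to 5 how well the response fits the "
                    f"instruction.\nInstruction: {instruction}\n"
                    f"Response: {doc}\nScore (just the number):")
        if int(score.strip()) >= threshold:
            curated.append({"instruction": instruction, "output": doc})
    return curated  # finetune a stronger model on this set, then iterate
```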

arXiv:2307.14117 (https://arxiv.org/abs/2307.14117) [pdf, other]
Subjects: cs.CL
Title: Leveraging Implicit Feedback from Deployment Data in Dialogue
Authors: Richard Yuanzhe Pang, Stephen Roller, Kyunghyun Cho, He He, Jason Weston
Abstract: We study improving social conversational agents by learning from natural dialogue between users and a deployed model, without extra annotations. To implicitly measure the quality of a machine-generated utterance, we leverage signals such as user response length, sentiment, and the reaction in subsequent human utterances in the collected dialogue episodes. Our experiments use the publicly released deployment data from BlenderBot (Xu et al., 2023). Human evaluation indicates improvements in our new models over baseline responses; however, we find that some proxy signals can also lead to more generations with undesirable properties. For example, optimizing for conversation length can lead to more controversial or unfriendly generations compared to the baseline, whereas optimizing for positive sentiment or reaction can decrease these behaviors.
Submitted 31 January, 2024; v1 submitted 26 July, 2023; originally announced July 2023.
Comments: EACL 2024
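
To make the proxy signals concrete, here is a toy scoring function over the signals the abstract names. Everything quantitative here is invented for illustration: the weights, the word-count normalizer, and the assumed `sentiment` callable mapping text to [-1, 1] are not the paper's.

```python
def implicit_reward(next_human_turns, sentiment, max_words=20):
    """Score a bot utterance by what the user did next in the episode."""
    if not next_human_turns:
        return 0.0                                      # user walked away
    reply = next_human_turns[0]
    length_signal = min(len(reply.split()) / max_words, 1.0)   # engagement
    sentiment_signal = sentiment(reply)                        # reaction
    continued = 1.0 if len(next_human_turns) > 1 else 0.0      # kept talking
    return 0.4 * length_signal + 0.4 * sentiment_signal + 0.2 * continued
```

The abstract's caveat follows directly from a function like this: whichever signal carries weight is the one the optimized model will over-produce.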

arXiv:2306.13588 (https://arxiv.org/abs/2306.13588) [pdf, other]
Subjects: cs.CL; cs.AI
Title: System-Level Natural Language Feedback
Authors: Weizhe Yuan, Kyunghyun Cho, Jason Weston
Abstract: Natural language (NL) feedback offers rich insights into user experience. While existing studies focus on an instance-level approach, where feedback is used to refine specific examples, we introduce a framework for system-level use of NL feedback. We show how to use feedback to formalize system-level design decisions in a human-in-the-loop process, in order to produce better models. In particular, this is done through: (i) metric design for tasks; and (ii) language model prompt design for refining model responses. We conduct two case studies of this approach for improving search query and dialog response generation, demonstrating the effectiveness of system-level feedback. We show that the combination of system-level and instance-level feedback brings further gains, and that human-written instance-level feedback results in more grounded refinements than GPT-3.5-written ones, underlining the importance of human feedback for building systems. We release our code and data at https://github.com/yyy-Apple/Sys-NL-Feedback.
Submitted 2 February, 2024; v1 submitted 23 June, 2023; originally announced June 2023.
Comments: Accepted by EACL 2024

arXiv:2306.04765 (https://arxiv.org/abs/2306.04765) [pdf, other]
Subjects: cs.AI; cs.CL
Title: The HCI Aspects of Public Deployment of Research Chatbots: A User Study, Design Recommendations, and Open Challenges
Authors: Morteza Behrooz, William Ngan, Joshua Lane, Giuliano Morse, Benjamin Babcock, Kurt Shuster, Mojtaba Komeili, Moya Chen, Melanie Kambadur, Y-Lan Boureau, Jason Weston
Abstract: Publicly deploying research chatbots is a nuanced topic involving necessary risk-benefit analyses. While there have recently been frequent discussions on whether it is responsible to deploy such models, there has been far less focus on the interaction paradigms and design approaches that the resulting interfaces should adopt, in order to achieve their goals more effectively. We aim to pose, ground, and attempt to answer HCI questions involved in this scope, by reporting on a mixed-methods user study conducted on a recent research chatbot. We find that an abstract anthropomorphic representation for the agent has a significant effect on users' perception, that offering AI explainability may have an impact on feedback rates, and that the two levels of the chat experience (diegetic and extradiegetic) should be intentionally designed. We offer design recommendations and areas of further focus for the research community.
Submitted 7 June, 2023; originally announced June 2023.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by EACL 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.04765">arXiv:2306.04765</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2306.04765">pdf</a>, <a href="https://arxiv.org/format/2306.04765">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> The HCI Aspects of Public Deployment of Research Chatbots: A User Study, Design Recommendations, and Open Challenges </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Behrooz%2C+M">Morteza Behrooz</a>, <a href="/search/cs?searchtype=author&amp;query=Ngan%2C+W">William Ngan</a>, <a href="/search/cs?searchtype=author&amp;query=Lane%2C+J">Joshua Lane</a>, <a href="/search/cs?searchtype=author&amp;query=Morse%2C+G">Giuliano Morse</a>, <a href="/search/cs?searchtype=author&amp;query=Babcock%2C+B">Benjamin Babcock</a>, <a href="/search/cs?searchtype=author&amp;query=Shuster%2C+K">Kurt Shuster</a>, <a href="/search/cs?searchtype=author&amp;query=Komeili%2C+M">Mojtaba Komeili</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M">Moya Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Kambadur%2C+M">Melanie Kambadur</a>, <a href="/search/cs?searchtype=author&amp;query=Boureau%2C+Y">Y-Lan Boureau</a>, <a href="/search/cs?searchtype=author&amp;query=Weston%2C+J">Jason Weston</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.04765v1-abstract-short" style="display: inline;"> Publicly deploying research chatbots is a nuanced topic involving necessary risk-benefit analyses. While there have recently been frequent discussions on whether it is responsible to deploy such models, there has been far less focus on the interaction paradigms and design approaches that the resulting interfaces should adopt, in order to achieve their goals more effectively. We aim to pose, ground&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.04765v1-abstract-full').style.display = 'inline'; document.getElementById('2306.04765v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.04765v1-abstract-full" style="display: none;"> Publicly deploying research chatbots is a nuanced topic involving necessary risk-benefit analyses. While there have recently been frequent discussions on whether it is responsible to deploy such models, there has been far less focus on the interaction paradigms and design approaches that the resulting interfaces should adopt, in order to achieve their goals more effectively. We aim to pose, ground, and attempt to answer HCI questions involved in this scope, by reporting on a mixed-methods user study conducted on a recent research chatbot. 

arXiv:2305.05364 (https://arxiv.org/abs/2305.05364) [pdf, other]
Subjects: cs.LG; cs.AI; cs.CL
Title: Large Language Model Programs
Authors: Imanol Schlag, Sainbayar Sukhbaatar, Asli Celikyilmaz, Wen-tau Yih, Jason Weston, Jürgen Schmidhuber, Xian Li
Abstract: In recent years, large pre-trained language models (LLMs) have demonstrated the ability to follow instructions and perform novel tasks from a few examples. The possibility of parameterising an LLM through such in-context examples widens their capabilities at a much lower cost than finetuning. We extend this line of reasoning and present a method which further expands the capabilities of an LLM by embedding it within an algorithm or program. To demonstrate the benefits of this approach, we present an illustrative example of evidence-supported question answering. We obtain a 6.4% improvement over the chain-of-thought baseline through a more algorithmic approach without any finetuning. Furthermore, we highlight recent work from this perspective and discuss the advantages and disadvantages in comparison to the standard approaches.
Submitted 9 May, 2023; originally announced May 2023.
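
The idea of embedding the LLM inside a program, rather than the other way around, can be illustrated with a small evidence-supported QA sketch in the spirit of the abstract's example. The `llm` callable, the yes/no filtering prompt, and the evidence cap are assumptions, not the paper's implementation.

```python
def evidence_supported_qa(question, paragraphs, llm, max_evidence=3):
    evidence = []
    # Program step 1: an ordinary loop scans candidate paragraphs one at a
    # time and keeps those the model judges relevant. The control flow
    # belongs to the program, not the model.
    for p in paragraphs:
        verdict = llm(f"Question: {question}\nParagraph: {p}\n"
                      "Does this paragraph help answer the question? yes/no:")
        if verdict.strip().lower().startswith("yes"):
            evidence.append(p)
        if len(evidence) == max_evidence:
            break
    # Program step 2: answer using only the selected evidence.
    context = "\n\n".join(evidence)
    return llm(f"Evidence:\n{context}\n\nQuestion: {question}\n"
               "Answer using only the evidence above:")
```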

arXiv:2305.00833 (https://arxiv.org/abs/2305.00833) [pdf, other]
Subjects: cs.LG; cs.AI; cs.CL
Title: Learning to Reason and Memorize with Self-Notes
Authors: Jack Lanchantin, Shubham Toshniwal, Jason Weston, Arthur Szlam, Sainbayar Sukhbaatar
Abstract: Large language models have been shown to struggle with multi-step reasoning, and do not retain previous reasoning steps for future use. We propose a simple method for solving both of these problems by allowing the model to take Self-Notes. Unlike recent chain-of-thought or scratchpad approaches, the model can deviate from the input context at any time to explicitly think and write down its thoughts. This allows the model to perform reasoning on the fly as it reads the context, and even integrate previous reasoning steps, thus enhancing its memory with useful information and enabling multi-step reasoning. Experiments across a wide variety of tasks demonstrate that our method can outperform chain-of-thought and scratchpad methods by taking Self-Notes that interleave the input text.
Submitted 31 October, 2023; v1 submitted 1 May, 2023; originally announced May 2023.
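
A caveat up front: the paper trains the model to emit Self-Notes inline as it encodes the input, whereas the sketch below only approximates the behavior with prompting. The chunked reading, the "NONE" convention, and the `llm` callable are all invented for illustration.

```python
def read_with_self_notes(chunks, question, llm):
    """Read the context incrementally, letting the model interleave notes."""
    working_context = ""
    for chunk in chunks:
        working_context += chunk
        # Give the model a chance to deviate from the input and write down an
        # intermediate thought; "NONE" means it declines to take a note here.
        note = llm(f"{working_context}\nIf a short note would help later "
                   "reasoning, write it now; otherwise reply NONE:")
        if note.strip().upper() != "NONE":
            working_context += f"\n[Note: {note.strip()}]\n"  # interleave it
    return llm(f"{working_context}\nQuestion: {question}\nAnswer:")
```

The point of the interleaving, as opposed to a chain-of-thought written after the question, is that each note is available as memory for everything read after it.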

arXiv:2304.13835 (https://arxiv.org/abs/2304.13835) [pdf, other]
Subjects: cs.CL; cs.LG
Title: Multi-Party Chat: Conversational Agents in Group Settings with Humans and Models
Authors: Jimmy Wei, Kurt Shuster, Arthur Szlam, Jason Weston, Jack Urbanek, Mojtaba Komeili
Abstract: Current dialogue research primarily studies pairwise (two-party) conversations, and does not address the everyday setting where more than two speakers converse together. In this work, we both collect and evaluate multi-party conversations to study this more general case. We use the LIGHT environment to construct grounded conversations, where each participant has an assigned character to role-play. We thus evaluate the ability of language models to act as one or more characters in such conversations. Models require two skills that pairwise-trained models appear to lack: (1) being able to decide when to talk; (2) producing coherent utterances grounded on multiple characters. We compare models trained on our new dataset to existing pairwise-trained dialogue models, as well as to large language models with few-shot prompting. We find that our new dataset, MultiLIGHT, which we will publicly release, can help bring significant improvements in the group setting.
Submitted 8 June, 2023; v1 submitted 26 April, 2023; originally announced April 2023.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.05364v1-abstract-full').style.display = 'none'; document.getElementById('2305.05364v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.00833">arXiv:2305.00833</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.00833">pdf</a>, <a href="https://arxiv.org/format/2305.00833">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Learning to Reason and Memorize with Self-Notes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lanchantin%2C+J">Jack Lanchantin</a>, <a href="/search/cs?searchtype=author&amp;query=Toshniwal%2C+S">Shubham Toshniwal</a>, <a href="/search/cs?searchtype=author&amp;query=Weston%2C+J">Jason Weston</a>, <a href="/search/cs?searchtype=author&amp;query=Szlam%2C+A">Arthur Szlam</a>, <a href="/search/cs?searchtype=author&amp;query=Sukhbaatar%2C+S">Sainbayar Sukhbaatar</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.00833v2-abstract-short" style="display: inline;"> Large language models have been shown to struggle with multi-step reasoning, and do not retain previous reasoning steps for future use. We propose a simple method for solving both of these problems by allowing the model to take Self-Notes. Unlike recent chain-of-thought or scratchpad approaches, the model can deviate from the input context at any time to explicitly think and write down its thought&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.00833v2-abstract-full').style.display = 'inline'; document.getElementById('2305.00833v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.00833v2-abstract-full" style="display: none;"> Large language models have been shown to struggle with multi-step reasoning, and do not retain previous reasoning steps for future use. We propose a simple method for solving both of these problems by allowing the model to take Self-Notes. Unlike recent chain-of-thought or scratchpad approaches, the model can deviate from the input context at any time to explicitly think and write down its thoughts. This allows the model to perform reasoning on the fly as it reads the context and even integrate previous reasoning steps, thus enhancing its memory with useful information and enabling multi-step reasoning. Experiments across a wide variety of tasks demonstrate that our method can outperform chain-of-thought and scratchpad methods by taking Self-Notes that interleave the input text. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.00833v2-abstract-full').style.display = 'none'; document.getElementById('2305.00833v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.13835">arXiv:2304.13835</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2304.13835">pdf</a>, <a href="https://arxiv.org/format/2304.13835">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Multi-Party Chat: Conversational Agents in Group Settings with Humans and Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wei%2C+J">Jimmy Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Shuster%2C+K">Kurt Shuster</a>, <a href="/search/cs?searchtype=author&amp;query=Szlam%2C+A">Arthur Szlam</a>, <a href="/search/cs?searchtype=author&amp;query=Weston%2C+J">Jason Weston</a>, <a href="/search/cs?searchtype=author&amp;query=Urbanek%2C+J">Jack Urbanek</a>, <a href="/search/cs?searchtype=author&amp;query=Komeili%2C+M">Mojtaba Komeili</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.13835v3-abstract-short" style="display: inline;"> Current dialogue research primarily studies pairwise (two-party) conversations, and does not address the everyday setting where more than two speakers converse together. In this work, we both collect and evaluate multi-party conversations to study this more general case. We use the LIGHT environment to construct grounded conversations, where each participant has an assigned character to role-play.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.13835v3-abstract-full').style.display = 'inline'; document.getElementById('2304.13835v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.13835v3-abstract-full" style="display: none;"> Current dialogue research primarily studies pairwise (two-party) conversations, and does not address the everyday setting where more than two speakers converse together. In this work, we both collect and evaluate multi-party conversations to study this more general case. We use the LIGHT environment to construct grounded conversations, where each participant has an assigned character to role-play. We thus evaluate the ability of language models to act as one or more characters in such conversations. Models require two skills that pairwise-trained models appear to lack: (1) being able to decide when to talk; (2) producing coherent utterances grounded on multiple characters. 

arXiv:2211.05826 (https://arxiv.org/abs/2211.05826) [pdf, other]
Subjects: cs.CL; cs.AI
Title: The CRINGE Loss: Learning what language not to model
Authors: Leonard Adolphs, Tianyu Gao, Jing Xu, Kurt Shuster, Sainbayar Sukhbaatar, Jason Weston
Abstract: Standard language model training employs gold human documents or human-human interaction data, and treats all training data as positive examples. Growing evidence shows that even with very large amounts of positive training data, issues remain that can be alleviated with relatively small amounts of negative data: examples of what the model should not do. In this work, we propose a novel procedure to train with such data called the CRINGE loss (ContRastive Iterative Negative GEneration). We show the effectiveness of this approach across three different experiments on the tasks of safe generation, contradiction avoidance, and open-domain dialogue. Our models outperform multiple strong baselines and are conceptually simple, easy to train and implement.
Submitted 10 November, 2022; originally announced November 2022.
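
A minimal PyTorch sketch of what a contrastive loss on a negative sequence can look like, in the spirit of the name: each token of the negative example is contrasted against a "positive" token sampled from the model's own top-k predictions. This covers only the negative-example term (positive data would keep ordinary cross-entropy), omits details such as masking positions where the sample ties the negative token, and is not the paper's exact formulation.

```python
import torch
import torch.nn.functional as F

def cringe_style_loss(logits, neg_tokens, k=5):
    """logits: [T, V] per-position scores; neg_tokens: [T] LongTensor of the
    tokens in a sequence labeled negative."""
    topk_logits, topk_ids = logits.topk(k, dim=-1)
    # Sample one "positive" per position from the model's own top-k guesses.
    sampled = torch.multinomial(F.softmax(topk_logits, dim=-1), 1)   # [T, 1]
    pos_tokens = topk_ids.gather(-1, sampled).squeeze(-1)            # [T]
    pos_scores = logits.gather(-1, pos_tokens.unsqueeze(-1)).squeeze(-1)
    neg_scores = logits.gather(-1, neg_tokens.unsqueeze(-1)).squeeze(-1)
    # Binary contrast per position: the sampled positive should outscore
    # the negative token (index 0 is the "correct" class).
    pair = torch.stack([pos_scores, neg_scores], dim=-1)             # [T, 2]
    target = torch.zeros(len(neg_tokens), dtype=torch.long, device=logits.device)
    return F.cross_entropy(pair, target)
```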

arXiv:2210.15893 (https://arxiv.org/abs/2210.15893) [pdf, other]
Subjects: cs.CL; cs.AI
Title: When Life Gives You Lemons, Make Cherryade: Converting Feedback from Bad Responses into Good Labels
Authors: Weiyan Shi, Emily Dinan, Kurt Shuster, Jason Weston, Jing Xu
Abstract: Deployed dialogue agents have the potential to integrate human feedback to continuously improve themselves. However, humans may not always provide explicit signals when the chatbot makes mistakes during interactions. In this work, we propose Juicer, a framework to make use of both binary and free-form textual human feedback. It works by: (i) extending sparse binary feedback by training a satisfaction classifier to label the unlabeled data; and (ii) training a reply corrector to map the bad replies to good ones. We find that augmenting training with model-corrected replies improves the final dialogue model, and we can further improve performance by using both positive and negative replies through the recently proposed Director model.
Submitted 28 October, 2022; originally announced October 2022.
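
The two-part pipeline in the abstract reduces to a short data-conversion loop. In this sketch, `satisfaction` and `corrector` are assumed to be models trained elsewhere (on the sparse binary feedback and on bad-to-good reply pairs respectively), and the 0.5 threshold is illustrative.

```python
def juice(deployment_logs, satisfaction, corrector, threshold=0.5):
    """deployment_logs: list of (context, reply) pairs without explicit labels."""
    new_examples = []
    for context, reply in deployment_logs:
        # (i) The satisfaction classifier extends the sparse binary feedback
        # by labeling replies that received no explicit user signal.
        if satisfaction(context, reply) > threshold:
            new_examples.append((context, reply))      # keep good replies as-is
        else:
            # (ii) The reply corrector maps a bad reply to a good one, which
            # then serves as a positive training example.
            new_examples.append((context, corrector(context, reply)))
    return new_examples
```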

arXiv:2208.03295 (https://arxiv.org/abs/2208.03295) [pdf, other]
Subjects: cs.CL; cs.AI
Title: Learning from data in the mixed adversarial non-adversarial case: Finding the helpers and ignoring the trolls
Authors: Da Ju, Jing Xu, Y-Lan Boureau, Jason Weston
Abstract: The promise of interaction between intelligent conversational agents and humans is that models can learn from such feedback in order to improve. Unfortunately, such exchanges in the wild will not always involve human utterances that are benign or of high quality, and will include a mixture of engaged (helpers) and unengaged or even malicious users (trolls). In this work we study how to perform robust learning in such an environment. We introduce a benchmark evaluation, SafetyMix, which can evaluate methods that learn safe vs. toxic language in a variety of adversarial settings to test their robustness. We propose and analyze several mitigating learning algorithms that identify trolls either at the example or at the user level. Our main finding is that user-based methods, which take into account that troll users will exhibit adversarial behavior across multiple examples, work best in a variety of settings on our benchmark. We then test these methods in a further real-life setting of conversations collected during deployment, with similar results.
Submitted 5 August, 2022; originally announced August 2022.
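
A toy version of the user-level idea the abstract reports as working best: score examples individually, but make the keep/drop decision per user, because a troll's adversarial behavior accumulates across their examples while any single score is noisy. The `toxicity_score` callable and the threshold are assumptions, not the paper's specific algorithms.

```python
from collections import defaultdict

def filter_trolls(examples, toxicity_score, user_threshold=0.5):
    """examples: list of (user_id, text) pairs; returns pairs from trusted users."""
    by_user = defaultdict(list)
    for user_id, text in examples:
        by_user[user_id].append(toxicity_score(text))   # per-example evidence
    trusted = {user for user, scores in by_user.items()
               if sum(scores) / len(scores) < user_threshold}
    return [(user, text) for user, text in examples if user in trusted]
```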

arXiv:2208.03270 (https://arxiv.org/abs/2208.03270) [pdf, other]
Subjects: cs.CL; cs.AI
Title: Learning New Skills after Deployment: Improving open-domain internet-driven dialogue with human feedback
Authors: Jing Xu, Megan Ung, Mojtaba Komeili, Kushal Arora, Y-Lan Boureau, Jason Weston
Abstract: Frozen models trained to mimic static datasets can never improve their performance. Models that can employ internet retrieval for up-to-date information and obtain feedback from humans during deployment offer the promise of both adapting to new information and improving their performance. In this work we study how to improve internet-driven conversational skills in such a learning framework. We collect deployment data of human interactions, which we make publicly available, and collect various types of human feedback, including binary quality measurements, free-form text feedback, and fine-grained reasons for failure. We then study various algorithms for improving from such feedback, including standard supervised learning, rejection sampling, model guiding, and reward-based learning, in order to make recommendations on which type of feedback and algorithms work best. We find the recently introduced Director model (Arora et al., '22) shows significant improvements over other existing approaches.
Submitted 16 August, 2022; v1 submitted 5 August, 2022; originally announced August 2022.
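
Of the algorithms the abstract lists, rejection sampling is the most compact to illustrate: draw several candidates and keep the one a feedback-trained reward model prefers. `llm` and `reward_model` are assumed callables; n=8 is an arbitrary choice.

```python
def rejection_sample(context, llm, reward_model, n=8):
    """Best-of-n: keep the candidate the learned reward model scores highest."""
    candidates = [llm(context) for _ in range(n)]
    return max(candidates, key=lambda c: reward_model(context, c))
```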
We find that augmenting training with model-corrected replies improves the final dialogue model, and we can further improve performance by using both positive and negative replies through the recently proposed Director model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2210.15893v1-abstract-full').style.display = 'none'; document.getElementById('2210.15893v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.03295">arXiv:2208.03295</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2208.03295">pdf</a>, <a href="https://arxiv.org/format/2208.03295">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Learning from data in the mixed adversarial non-adversarial case: Finding the helpers and ignoring the trolls </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ju%2C+D">Da Ju</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jing Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Boureau%2C+Y">Y-Lan Boureau</a>, <a href="/search/cs?searchtype=author&amp;query=Weston%2C+J">Jason Weston</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.03295v1-abstract-short" style="display: inline;"> The promise of interaction between intelligent conversational agents and humans is that models can learn from such feedback in order to improve. Unfortunately, such exchanges in the wild will not always involve human utterances that are benign or of high quality, and will include a mixture of engaged (helpers) and unengaged or even malicious users (trolls). In this work we study how to perform rob&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.03295v1-abstract-full').style.display = 'inline'; document.getElementById('2208.03295v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.03295v1-abstract-full" style="display: none;"> The promise of interaction between intelligent conversational agents and humans is that models can learn from such feedback in order to improve. Unfortunately, such exchanges in the wild will not always involve human utterances that are benign or of high quality, and will include a mixture of engaged (helpers) and unengaged or even malicious users (trolls). In this work we study how to perform robust learning in such an environment. We introduce a benchmark evaluation, SafetyMix, which can evaluate methods that learn safe vs. toxic language in a variety of adversarial settings to test their robustness. We propose and analyze several mitigating learning algorithms that identify trolls either at the example or at the user level. 
Our main finding is that user-based methods, that take into account that troll users will exhibit adversarial behavior across multiple examples, work best in a variety of settings on our benchmark. We then test these methods in a further real-life setting of conversations collected during deployment, with similar results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.03295v1-abstract-full').style.display = 'none'; document.getElementById('2208.03295v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.03270">arXiv:2208.03270</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2208.03270">pdf</a>, <a href="https://arxiv.org/format/2208.03270">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Learning New Skills after Deployment: Improving open-domain internet-driven dialogue with human feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jing Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Ung%2C+M">Megan Ung</a>, <a href="/search/cs?searchtype=author&amp;query=Komeili%2C+M">Mojtaba Komeili</a>, <a href="/search/cs?searchtype=author&amp;query=Arora%2C+K">Kushal Arora</a>, <a href="/search/cs?searchtype=author&amp;query=Boureau%2C+Y">Y-Lan Boureau</a>, <a href="/search/cs?searchtype=author&amp;query=Weston%2C+J">Jason Weston</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.03270v2-abstract-short" style="display: inline;"> Frozen models trained to mimic static datasets can never improve their performance. Models that can employ internet-retrieval for up-to-date information and obtain feedback from humans during deployment provide the promise of both adapting to new information, and improving their performance. In this work we study how to improve internet-driven conversational skills in such a learning framework. We&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.03270v2-abstract-full').style.display = 'inline'; document.getElementById('2208.03270v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.03270v2-abstract-full" style="display: none;"> Frozen models trained to mimic static datasets can never improve their performance. Models that can employ internet-retrieval for up-to-date information and obtain feedback from humans during deployment provide the promise of both adapting to new information, and improving their performance. In this work we study how to improve internet-driven conversational skills in such a learning framework. 
arXiv:2208.03270 [pdf, other] (cs.CL, cs.AI)
Learning New Skills after Deployment: Improving open-domain internet-driven dialogue with human feedback
Authors: Jing Xu, Megan Ung, Mojtaba Komeili, Kushal Arora, Y-Lan Boureau, Jason Weston
Abstract: Frozen models trained to mimic static datasets can never improve their performance. Models that can employ internet-retrieval for up-to-date information and obtain feedback from humans during deployment provide the promise of both adapting to new information, and improving their performance. In this work we study how to improve internet-driven conversational skills in such a learning framework. We collect deployment data, which we make publicly available, of human interactions, and collect various types of human feedback -- including binary quality measurements, free-form text feedback, and fine-grained reasons for failure. We then study various algorithms for improving from such feedback, including standard supervised learning, rejection sampling, model-guiding and reward-based learning, in order to make recommendations on which type of feedback and algorithms work best. We find the recently introduced Director model (Arora et al., '22) shows significant improvements over other existing approaches.
Submitted 16 August, 2022; v1 submitted 5 August, 2022; originally announced August 2022.
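Among the algorithms the abstract lists, rejection sampling is the easiest to sketch: draw several candidate replies and keep the one a reward model (for instance, one trained on the binary quality feedback) scores highest. Everything below is a hypothetical illustration with stub models, not the paper's code:

```python
def sample_candidates(model, context, n=8):
    """Stub: draw n candidate replies from a dialogue model."""
    return [model(context, seed=i) for i in range(n)]

def reward(context, reply):
    """Stub reward model -- imagine it was trained on the binary quality
    feedback collected during deployment (scoring rule invented here)."""
    return -abs(len(reply) - 40) + 10 * ("search" in reply)

def rejection_sample(model, context, n=8):
    """Generate-then-rerank: keep the candidate the reward model likes best."""
    candidates = sample_candidates(model, context, n)
    return max(candidates, key=lambda r: reward(context, r))

# Toy "model" returning canned strings so the sketch runs end to end.
def toy_model(context, seed):
    return f"reply {seed} to '{context}'" + (" via search" if seed % 2 else "")

print(rejection_sample(toy_model, "what's new in NLP?"))
```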
style="display: inline;"> We present BlenderBot 3, a 175B parameter dialogue model capable of open-domain conversation with access to the internet and a long-term memory, and having been trained on a large number of user defined tasks. We release both the model weights and code, and have also deployed the model on a public web page to interact with organic users. This technical report describes how the model was built (arc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.03188v3-abstract-full').style.display = 'inline'; document.getElementById('2208.03188v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.03188v3-abstract-full" style="display: none;"> We present BlenderBot 3, a 175B parameter dialogue model capable of open-domain conversation with access to the internet and a long-term memory, and having been trained on a large number of user defined tasks. We release both the model weights and code, and have also deployed the model on a public web page to interact with organic users. This technical report describes how the model was built (architecture, model and training scheme), and details of its deployment, including safety mechanisms. Human evaluations show its superiority to existing open-domain dialogue agents, including its predecessors (Roller et al., 2021; Komeili et al., 2022). Finally, we detail our plan for continual learning using the data collected from deployment, which will also be publicly released. The goal of this research program is thus to enable the community to study ever-improving responsible agents that learn through interaction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.03188v3-abstract-full').style.display = 'none'; document.getElementById('2208.03188v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. 
arXiv:2206.07694 [pdf, other] (cs.CL)
DIRECTOR: Generator-Classifiers For Supervised Language Modeling
Authors: Kushal Arora, Kurt Shuster, Sainbayar Sukhbaatar, Jason Weston
Abstract: Current language models achieve low perplexity but their resulting generations still suffer from toxic responses, repetitiveness and contradictions. The standard language modeling setup fails to address these issues. In this paper, we introduce a new architecture, Director, that consists of a unified generator-classifier with both a language modeling and a classification head for each output token. Training is conducted jointly using both standard language modeling data, and data labeled with desirable and undesirable sequences. Experiments in several settings show that the model has competitive training and decoding speed compared to standard language models while yielding superior results, alleviating known issues while maintaining generation quality. It also outperforms existing model guiding approaches in terms of both accuracy and efficiency.
Submitted 25 November, 2022; v1 submitted 15 June, 2022; originally announced June 2022.
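The Director architecture is concrete enough to sketch: one shared trunk feeds a language-modeling head plus a per-token classification head, and the two are combined when scoring next tokens. The PyTorch sketch below shows only decoding-time scoring; the embedding stand-in for the transformer trunk, the sizes, and the gamma-weighted fusion rule are all assumptions, not the paper's exact formulation:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class GeneratorClassifier(nn.Module):
    """Sketch of a Director-style unified generator-classifier: a shared trunk
    feeds (a) a language-modeling head and (b) a per-token classification head."""

    def __init__(self, vocab=100, dim=32):
        super().__init__()
        self.trunk = nn.Embedding(vocab, dim)  # stand-in for a transformer
        self.lm_head = nn.Linear(dim, vocab)   # next-token logits
        self.cls_head = nn.Linear(dim, vocab)  # "is this next token desirable?"

    def next_token_scores(self, last_token, gamma=1.0):
        h = self.trunk(last_token)                     # hidden state, shape (dim,)
        lm_logprobs = F.log_softmax(self.lm_head(h), dim=-1)
        cls_logprobs = F.logsigmoid(self.cls_head(h))  # per-token classifier
        return lm_logprobs + gamma * cls_logprobs      # fused decoding score

torch.manual_seed(0)
model = GeneratorClassifier()
scores = model.next_token_scores(torch.tensor(3))
print(int(scores.argmax()))  # greedy choice under the fused score
```

Training would add a standard LM loss on positive sequences and a binary classification loss on tokens from sequences labeled desirable or undesirable, matching the joint training the abstract describes.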
arXiv:2203.13224 [pdf, other] (cs.CL, cs.AI)
Language Models that Seek for Knowledge: Modular Search & Generation for Dialogue and Prompt Completion
Authors: Kurt Shuster, Mojtaba Komeili, Leonard Adolphs, Stephen Roller, Arthur Szlam, Jason Weston
Abstract: Language models (LMs) have recently been shown to generate more factual responses by employing modularity (Zhou et al., 2021) in combination with retrieval (Adolphs et al., 2021). We extend the recent approach of Adolphs et al. (2021) to include internet search as a module. Our SeeKeR (Search engine->Knowledge->Response) method thus applies a single LM to three modular tasks in succession: search, generating knowledge, and generating a final response. We show that, when using SeeKeR as a dialogue model, it outperforms the state-of-the-art model BlenderBot 2 (Chen et al., 2021) on open-domain knowledge-grounded conversations for the same number of parameters, in terms of consistency, knowledge and per-turn engagingness. SeeKeR applied to topical prompt completions as a standard language model outperforms GPT2 (Radford et al., 2019) and GPT3 (Brown et al., 2020) in terms of factuality and topicality, despite GPT3 being a vastly larger model. Our code and models are made publicly available.
Submitted 29 March, 2022; v1 submitted 24 March, 2022; originally announced March 2022.
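The Search engine->Knowledge->Response pipeline maps directly onto calling the same LM once per module. A minimal sketch, where the bracketed task prompts and the stubs are invented for illustration rather than SeeKeR's actual control tokens:

```python
def seeker_respond(lm, search_engine, context):
    """Sketch of the three modular tasks run by one LM in succession."""
    query = lm(f"[search] {context}")                     # task 1: search query
    documents = search_engine(query)                      # external retrieval
    knowledge = lm(f"[knowledge] {context} {documents}")  # task 2: knowledge sentence
    return lm(f"[response] {context} {knowledge}")        # task 3: final reply

# Toy stubs so the pipeline runs end to end.
def toy_lm(prompt):
    task = prompt.split("]")[0].lstrip("[")
    return f"<{task}-output>"

print(seeker_respond(toy_lm, lambda q: "<retrieved docs>", "Who created arXiv?"))
```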
arXiv:2201.04723 [pdf, other] (cs.CL, cs.AI)
Human Evaluation of Conversations is an Open Problem: comparing the sensitivity of various methods for evaluating dialogue agents
Authors: Eric Michael Smith, Orion Hsu, Rebecca Qian, Stephen Roller, Y-Lan Boureau, Jason Weston
Abstract: At the heart of improving conversational AI is the open problem of how to evaluate conversations. Issues with automatic metrics are well known (Liu et al., 2016, arXiv:1603.08023), with human evaluations still considered the gold standard. Unfortunately, how to perform human evaluations is also an open problem: differing data collection methods have varying levels of human agreement and statistical sensitivity, resulting in differing amounts of human annotation hours and labor costs. In this work we compare five different crowdworker-based human evaluation methods and find that different methods are best depending on the types of models compared, with no clear winner across the board. While this highlights the open problems in the area, our analysis leads to advice on when to use which one, and possible future directions.
Submitted 12 January, 2022; originally announced January 2022.
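"Statistical sensitivity" can be made concrete with a bootstrap: given pairwise preference judgments collected under one evaluation method, estimate how often n annotations suffice to identify the better model. The judgments below are synthetic (an assumed 60% true preference rate), purely for illustration:

```python
import random

random.seed(0)

# Synthetic pairwise judgments for model A vs. model B under one evaluation
# method: 1 = annotator preferred A.
judgments = [1 if random.random() < 0.6 else 0 for _ in range(200)]

def detection_rate(judgments, n, trials=2000):
    """Bootstrap estimate of how often n annotations identify A as the winner.
    A more statistically sensitive method needs a smaller n for the same rate."""
    hits = 0
    for _ in range(trials):
        sample = random.choices(judgments, k=n)
        hits += sum(sample) > n / 2
    return hits / trials

for n in (10, 50, 200):
    print(n, detection_rate(judgments, n))
```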
arXiv:2112.05843 [pdf, other] (cs.CL)
Am I Me or You? State-of-the-Art Dialogue Models Cannot Maintain an Identity
Authors: Kurt Shuster, Jack Urbanek, Arthur Szlam, Jason Weston
Abstract: State-of-the-art dialogue models still often stumble with regard to factual accuracy and self-contradiction. Anecdotally, they have been observed to fail to maintain character identity throughout discourse; more specifically, they may take on the role of their interlocutor. In this work we formalize and quantify this deficiency, and show experimentally through human evaluations that this is indeed a problem. In contrast, we show that discriminative models trained specifically to recognize who is speaking can perform well; further, these can be used as automated metrics. Finally, we evaluate a wide variety of mitigation methods, including changes to model architecture, training protocol, and decoding strategy. Our best models reduce mistaken identity issues by nearly 65% according to human annotators, while simultaneously improving engagingness. Despite these results, we find that maintaining character identity still remains a challenging problem.
Submitted 10 December, 2021; originally announced December 2021.
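A discriminative who-is-speaking model, as the abstract suggests, can double as an automated metric and as a decoding-time filter. In the sketch below, a keyword heuristic stands in for a trained classifier; the heuristic, the personas, and the candidates are all invented for illustration:

```python
def identity_score(candidate, own_persona, partner_persona):
    """Stub for a discriminative who-is-speaking model: probability that the
    candidate reply sounds like *our* speaker rather than the interlocutor."""
    mine = sum(word in candidate for word in own_persona)
    theirs = sum(word in candidate for word in partner_persona)
    return (mine + 1) / (mine + theirs + 2)  # add-one smoothing

def rerank_by_identity(candidates, own_persona, partner_persona):
    """Decoding-time filter: pick the reply least likely to be a
    mistaken-identity error (the same score also works as a metric)."""
    return max(candidates,
               key=lambda c: identity_score(c, own_persona, partner_persona))

own = ("guitar", "chef")
partner = ("skiing", "doctor")
candidates = ["As a chef I love cooking with garlic.",
              "Being a doctor, I ski every winter."]
print(rerank_by_identity(candidates, own, partner))  # picks the chef reply
```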
arXiv:2111.05204 [pdf, other] (cs.CL, cs.AI, cs.LG)
Reason first, then respond: Modular Generation for Knowledge-infused Dialogue
Authors: Leonard Adolphs, Kurt Shuster, Jack Urbanek, Arthur Szlam, Jason Weston
Abstract: Large language models can produce fluent dialogue but often hallucinate factual inaccuracies. While retrieval-augmented models help alleviate this issue, they still face a difficult challenge of both reasoning to provide correct knowledge and generating conversation simultaneously. In this work, we propose a modular model, Knowledge to Response (K2R), for incorporating knowledge into conversational agents, which breaks down this problem into two easier steps. K2R first generates a knowledge sequence, given a dialogue context, as an intermediate step. After this "reasoning step", the model then attends to its own generated knowledge sequence, as well as the dialogue context, to produce a final response.
In detailed experiments, we find that such a model hallucinates less in knowledge-grounded dialogue tasks, and has advantages in terms of interpretability and modularity. In particular, it can be used to fuse QA and dialogue systems together to enable dialogue agents to give knowledgeable answers, or QA models to give conversational responses in a zero-shot setting.
Submitted 9 November, 2021; originally announced November 2021.
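K2R's two steps map directly onto two model calls, and the zero-shot QA-plus-dialogue fusion the abstract mentions then amounts to using a QA model for the first call. The prompt format and stub models below are assumptions, not the paper's interface:

```python
def k2r_respond(knowledge_model, response_model, context):
    """Sketch of K2R's two steps: generate an intermediate knowledge sequence,
    then generate a reply conditioned on both context and knowledge."""
    knowledge = knowledge_model(context)
    return response_model(f"{context}\nknowledge: {knowledge}")

# Zero-shot fusion as described in the abstract: a QA model supplies the
# knowledge and a dialogue model phrases it conversationally (stubs).
qa_model = lambda ctx: "Mount Everest (8,849 m)"
dialogue_model = lambda p: f"It's {p.split('knowledge: ')[1]} -- quite a climb!"

print(k2r_respond(qa_model, dialogue_model, "What's the tallest mountain?"))
```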
class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
