<!-- CINXE.COM -->
<!-- Large language model - Wikipedia -->
<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available" lang="en" dir="ltr">
<head>
<meta charset="UTF-8">
<title>Large language model - Wikipedia</title>
<!--
  Inline bootstrap script. It does two things:
    1. swaps the "client-nojs" class list on <html> for a "client-js" one,
       applying any per-user preferences stored in a cookie;
    2. defines the globals RLCONF, RLSTATE and RLPAGEMODULES.
-->
<script>
(function () {
  // Class list to install on <html>. Same tokens as the static class attribute
  // above, except that "client-nojs" is replaced by "client-js".
  var className = "client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available";

  // Value of the "enwikimwclientpreferences" cookie, if present.
  var cookie = document.cookie.match(/(?:^|; )enwikimwclientpreferences=([^;]+)/);

  if (cookie) {
    // The cookie value is a list of "<feature>-clientpref-<value>" tokens
    // separated by "%2C" (a URL-encoded comma).
    cookie[1].split('%2C').forEach(function (pref) {
      // Reduce the token to its feature name: drop the trailing
      // "-clientpref-<value>" and any character that is not a word character
      // or a hyphen, so the result can be embedded in a RegExp source.
      var feature = pref.replace(/-clientpref-\w+$|[^\w-]+/g, '');

      // Replace the default "<feature>-clientpref-<value>" class token with the
      // token from the cookie, keeping the surrounding separators ($1 / $2).
      className = className.replace(
        new RegExp('(^| )' + feature + '-clientpref-\\w+( |$)'),
        '$1' + pref + '$2'
      );
    });
  }

  document.documentElement.className = className;
}());

// Page and request configuration values, assigned as an implicit global.
// NOTE(review): presumably read by the ResourceLoader startup script that is
// loaded later in <head>; the consumer is not visible in this block.
RLCONF = {
  "wgBreakFrames": false,
  "wgSeparatorTransformTable": ["", ""],
  "wgDigitTransformTable": ["", ""],
  "wgDefaultDateFormat": "dmy",
  "wgMonthNames": ["", "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"],
  "wgRequestId": "7cabca14-af67-49de-9d60-9a94e40b625b",
  "wgCanonicalNamespace": "",
  "wgCanonicalSpecialPageName": false,
  "wgNamespaceNumber": 0,
  "wgPageName": "Large_language_model",
  // Keep this literal on a single line: a raw line break inside a quoted
  // string is a syntax error and stops the entire inline script from running.
  "wgTitle": "Large language model",
  "wgCurRevisionId": 1258457238,
  "wgRevisionId": 1258457238,
  "wgArticleId": 73248112,
  "wgIsArticle": true,
  "wgIsRedirect": false,
  "wgAction": "view",
  "wgUserName": null,
  "wgUserGroups": ["*"],
  "wgCategories": [
    "CS1: long volume value",
    "Webarchive template wayback links",
    "Articles with short description",
    "Short description is different from Wikidata",
    "Articles containing potentially dated statements from 2024",
    "All articles containing potentially dated statements",
    "Articles containing potentially dated statements from June 2024",
    "All accuracy disputes",
    "Articles with disputed statements from September 2024",
    "All articles with unsourced statements",
    "Articles with unsourced statements from February 2024",
    "Articles containing potentially dated statements from October 2024",
    "Large language models",
    "Deep learning",
    "Natural language processing"
  ],
  "wgPageViewLanguage": "en",
  "wgPageContentLanguage": "en",
  "wgPageContentModel": "wikitext",
  "wgRelevantPageName": "Large_language_model",
  "wgRelevantArticleId": 73248112,
  "wgIsProbablyEditable": true,
  "wgRelevantPageIsProbablyEditable": true,
  "wgRestrictionEdit": [],
  "wgRestrictionMove": [],
  "wgNoticeProject": "wikipedia",
  "wgCiteReferencePreviewsActive": false,
  "wgFlaggedRevsParams": {"tags": {"status": {"levels": 1}}},
  "wgMediaViewerOnClick": true,
  "wgMediaViewerEnabledByDefault": true,
  "wgPopupsFlags": 0,
  "wgVisualEditor": {"pageLanguageCode": "en", "pageLanguageDir": "ltr", "pageVariantFallbacks": "en"},
  "wgMFDisplayWikibaseDescriptions": {"search": true, "watchlist": true, "tagline": false, "nearby": true},
  "wgWMESchemaEditAttemptStepOversample": false,
  "wgWMEPageLength": 200000,
  "wgRelatedArticlesCompat": [],
  "wgCentralAuthMobileDomain": false,
  "wgEditSubmitButtonLabelPublish": true,
  "wgULSPosition": "interlanguage",
  "wgULSisCompactLinksEnabled": false,
  "wgVector2022LanguageInHeader": true,
  "wgULSisLanguageSelectorEmpty": false,
  "wgWikibaseItemId": "Q115305900",
  "wgCheckUserClientHintsHeadersJsApi": ["brands", "architecture", "bitness", "fullVersionList", "mobile", "model", "platform", "platformVersion"],
  "GEHomepageSuggestedEditsEnableTopics": true,
  "wgGETopicsMatchModeEnabled": false,
  "wgGEStructuredTaskRejectionReasonTextInputEnabled": false,
  "wgGELevelingUpEnabledForUser": false
};

// Map of module name to a state string ("ready" or "loading"), assigned as an
// implicit global.
RLSTATE = {
  "ext.globalCssJs.user.styles": "ready",
  "site.styles": "ready",
  "user.styles": "ready",
  "ext.globalCssJs.user": "ready",
  "user": "ready",
  "user.options": "loading",
  "ext.cite.styles": "ready",
  "ext.math.styles": "ready",
  "skins.vector.search.codex.styles": "ready",
  "skins.vector.styles": "ready",
  "skins.vector.icons": "ready",
  "jquery.tablesorter.styles": "ready",
  "jquery.makeCollapsible.styles": "ready",
  "ext.wikimediamessages.styles": "ready",
  "ext.visualEditor.desktopArticleTarget.noscript": "ready",
  "ext.uls.interlanguage": "ready",
  "wikibase.client.init": "ready",
  "ext.wikimediaBadges": "ready"
};

// List of module names for this page, assigned as an implicit global.
RLPAGEMODULES = [
  "ext.cite.ux-enhancements",
  "mediawiki.page.media",
  "ext.scribunto.logs",
  "site",
  "mediawiki.page.ready",
  "jquery.tablesorter",
  "jquery.makeCollapsible",
  "mediawiki.toc",
  "skins.vector.js",
  "ext.centralNotice.geoIP",
  "ext.centralNotice.startUp",
  "ext.gadget.ReferenceTooltips",
  "ext.gadget.switcher",
  "ext.urlShortener.toolbar",
  "ext.centralauth.centralautologin",
  "mmv.bootstrap",
  "ext.popups",
  "ext.visualEditor.desktopArticleTarget.init",
  "ext.visualEditor.targetLoader",
  "ext.echo.centralauth",
  "ext.eventLogging",
  "ext.wikimediaEvents",
  "ext.navigationTiming",
  "ext.uls.interface",
  "ext.cx.eventlogging.campaigns",
  "ext.cx.uls.quick.actions",
  "wikibase.client.vector-2022",
  "ext.checkUser.clientHints",
  "ext.growthExperiments.SuggestedEditSession",
  "wikibase.sidebar.tracking"
];
</script>
<script>(RLQ=window.RLQ||[]).push(function(){mw.loader.impl(function(){return["user.options@12s5i",function($,jQuery,require,module){mw.user.tokens.set({"patrolToken":"+\\","watchToken":"+\\","csrfToken":"+\\"}); }];});});</script> <link rel="stylesheet" href="/w/load.php?lang=en&modules=ext.cite.styles%7Cext.math.styles%7Cext.uls.interlanguage%7Cext.visualEditor.desktopArticleTarget.noscript%7Cext.wikimediaBadges%7Cext.wikimediamessages.styles%7Cjquery.makeCollapsible.styles%7Cjquery.tablesorter.styles%7Cskins.vector.icons%2Cstyles%7Cskins.vector.search.codex.styles%7Cwikibase.client.init&only=styles&skin=vector-2022"> <script async="" src="/w/load.php?lang=en&modules=startup&only=scripts&raw=1&skin=vector-2022"></script> <meta name="ResourceLoaderDynamicStyles" content=""> <link rel="stylesheet" href="/w/load.php?lang=en&modules=site.styles&only=styles&skin=vector-2022"> <meta name="generator" content="MediaWiki 1.44.0-wmf.4"> <meta name="referrer" content="origin"> <meta name="referrer" content="origin-when-cross-origin"> <meta name="robots" content="max-image-preview:standard"> <meta name="format-detection" content="telephone=no"> <meta name="viewport" content="width=1120"> <meta property="og:title" content="Large language model - Wikipedia"> <meta property="og:type" content="website"> <link rel="preconnect" href="//upload.wikimedia.org"> <link rel="alternate" media="only screen and (max-width: 640px)" href="//en.m.wikipedia.org/wiki/Large_language_model"> <link rel="alternate" type="application/x-wiki" title="Edit this page" href="/w/index.php?title=Large_language_model&action=edit"> <link rel="apple-touch-icon" href="/static/apple-touch/wikipedia.png"> <link rel="icon" href="/static/favicon/wikipedia.ico"> <link rel="search" type="application/opensearchdescription+xml" href="/w/rest.php/v1/search" title="Wikipedia (en)"> <link rel="EditURI" type="application/rsd+xml" href="//en.wikipedia.org/w/api.php?action=rsd"> <link rel="canonical" 
href="https://en.wikipedia.org/wiki/Large_language_model"> <link rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/deed.en"> <link rel="alternate" type="application/atom+xml" title="Wikipedia Atom feed" href="/w/index.php?title=Special:RecentChanges&feed=atom"> <link rel="dns-prefetch" href="//meta.wikimedia.org" /> <link rel="dns-prefetch" href="//login.wikimedia.org"> </head> <body class="skin--responsive skin-vector skin-vector-search-vue mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject mw-editable page-Large_language_model rootpage-Large_language_model skin-vector-2022 action-view"><a class="mw-jump-link" href="#bodyContent">Jump to content</a> <div class="vector-header-container"> <header class="vector-header mw-header"> <div class="vector-header-start"> <nav class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-dropdown" class="vector-dropdown vector-main-menu-dropdown vector-button-flush-left vector-button-flush-right" > <input type="checkbox" id="vector-main-menu-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-main-menu-dropdown" class="vector-dropdown-checkbox " aria-label="Main menu" > <label id="vector-main-menu-dropdown-label" for="vector-main-menu-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-menu mw-ui-icon-wikimedia-menu"></span> <span class="vector-dropdown-label-text">Main menu</span> </label> <div class="vector-dropdown-content"> <div id="vector-main-menu-unpinned-container" class="vector-unpinned-container"> <div id="vector-main-menu" class="vector-main-menu vector-pinnable-element"> <div class="vector-pinnable-header vector-main-menu-pinnable-header vector-pinnable-header-unpinned" data-feature-name="main-menu-pinned" data-pinnable-element-id="vector-main-menu" 
data-pinned-container-id="vector-main-menu-pinned-container" data-unpinned-container-id="vector-main-menu-unpinned-container" > <div class="vector-pinnable-header-label">Main menu</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-main-menu.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-main-menu.unpin">hide</button> </div> <div id="p-navigation" class="vector-menu mw-portlet mw-portlet-navigation" > <div class="vector-menu-heading"> Navigation </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="n-mainpage-description" class="mw-list-item"><a href="/wiki/Main_Page" title="Visit the main page [z]" accesskey="z"><span>Main page</span></a></li><li id="n-contents" class="mw-list-item"><a href="/wiki/Wikipedia:Contents" title="Guides to browsing Wikipedia"><span>Contents</span></a></li><li id="n-currentevents" class="mw-list-item"><a href="/wiki/Portal:Current_events" title="Articles related to current events"><span>Current events</span></a></li><li id="n-randompage" class="mw-list-item"><a href="/wiki/Special:Random" title="Visit a randomly selected article [x]" accesskey="x"><span>Random article</span></a></li><li id="n-aboutsite" class="mw-list-item"><a href="/wiki/Wikipedia:About" title="Learn about Wikipedia and how it works"><span>About Wikipedia</span></a></li><li id="n-contactpage" class="mw-list-item"><a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us" title="How to contact Wikipedia"><span>Contact us</span></a></li> </ul> </div> </div> <div id="p-interaction" class="vector-menu mw-portlet mw-portlet-interaction" > <div class="vector-menu-heading"> Contribute </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="n-help" class="mw-list-item"><a href="/wiki/Help:Contents" title="Guidance on how to use 
and edit Wikipedia"><span>Help</span></a></li><li id="n-introduction" class="mw-list-item"><a href="/wiki/Help:Introduction" title="Learn how to edit Wikipedia"><span>Learn to edit</span></a></li><li id="n-portal" class="mw-list-item"><a href="/wiki/Wikipedia:Community_portal" title="The hub for editors"><span>Community portal</span></a></li><li id="n-recentchanges" class="mw-list-item"><a href="/wiki/Special:RecentChanges" title="A list of recent changes to Wikipedia [r]" accesskey="r"><span>Recent changes</span></a></li><li id="n-upload" class="mw-list-item"><a href="/wiki/Wikipedia:File_upload_wizard" title="Add images or other media for use on Wikipedia"><span>Upload file</span></a></li> </ul> </div> </div> </div> </div> </div> </div> </nav> <a href="/wiki/Main_Page" class="mw-logo"> <img class="mw-logo-icon" src="/static/images/icons/wikipedia.png" alt="" aria-hidden="true" height="50" width="50"> <span class="mw-logo-container skin-invert"> <img class="mw-logo-wordmark" alt="Wikipedia" src="/static/images/mobile/copyright/wikipedia-wordmark-en.svg" style="width: 7.5em; height: 1.125em;"> <img class="mw-logo-tagline" alt="The Free Encyclopedia" src="/static/images/mobile/copyright/wikipedia-tagline-en.svg" width="117" height="13" style="width: 7.3125em; height: 0.8125em;"> </span> </a> </div> <div class="vector-header-end"> <div id="p-search" role="search" class="vector-search-box-vue vector-search-box-collapses vector-search-box-show-thumbnail vector-search-box-auto-expand-width vector-search-box"> <a href="/wiki/Special:Search" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only search-toggle" title="Search Wikipedia [f]" accesskey="f"><span class="vector-icon mw-ui-icon-search mw-ui-icon-wikimedia-search"></span> <span>Search</span> </a> <div class="vector-typeahead-search-container"> <div class="cdx-typeahead-search cdx-typeahead-search--show-thumbnail 
cdx-typeahead-search--auto-expand-width"> <form action="/w/index.php" id="searchform" class="cdx-search-input cdx-search-input--has-end-button"> <div id="simpleSearch" class="cdx-search-input__input-wrapper" data-search-loc="header-moved"> <div class="cdx-text-input cdx-text-input--has-start-icon"> <input class="cdx-text-input__input" type="search" name="search" placeholder="Search Wikipedia" aria-label="Search Wikipedia" autocapitalize="sentences" title="Search Wikipedia [f]" accesskey="f" id="searchInput" > <span class="cdx-text-input__icon cdx-text-input__start-icon"></span> </div> <input type="hidden" name="title" value="Special:Search"> </div> <button class="cdx-button cdx-search-input__end-button">Search</button> </form> </div> </div> </div> <nav class="vector-user-links vector-user-links-wide" aria-label="Personal tools"> <div class="vector-user-links-main"> <div id="p-vector-user-menu-preferences" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <div id="p-vector-user-menu-userpage" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <nav class="vector-appearance-landmark" aria-label="Appearance"> <div id="vector-appearance-dropdown" class="vector-dropdown " title="Change the appearance of the page's font size, width, and color" > <input type="checkbox" id="vector-appearance-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-appearance-dropdown" class="vector-dropdown-checkbox " aria-label="Appearance" > <label id="vector-appearance-dropdown-label" for="vector-appearance-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-appearance mw-ui-icon-wikimedia-appearance"></span> <span 
class="vector-dropdown-label-text">Appearance</span> </label> <div class="vector-dropdown-content"> <div id="vector-appearance-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <div id="p-vector-user-menu-notifications" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <div id="p-vector-user-menu-overflow" class="vector-menu mw-portlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-sitesupport-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en" class=""><span>Donate</span></a> </li> <li id="pt-createaccount-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="/w/index.php?title=Special:CreateAccount&returnto=Large+language+model" title="You are encouraged to create an account and log in; however, it is not mandatory" class=""><span>Create account</span></a> </li> <li id="pt-login-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="/w/index.php?title=Special:UserLogin&returnto=Large+language+model" title="You're encouraged to log in; however, it's not mandatory. 
[o]" accesskey="o" class=""><span>Log in</span></a> </li> </ul> </div> </div> </div> <div id="vector-user-links-dropdown" class="vector-dropdown vector-user-menu vector-button-flush-right vector-user-menu-logged-out" title="Log in and more options" > <input type="checkbox" id="vector-user-links-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-user-links-dropdown" class="vector-dropdown-checkbox " aria-label="Personal tools" > <label id="vector-user-links-dropdown-label" for="vector-user-links-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-ellipsis mw-ui-icon-wikimedia-ellipsis"></span> <span class="vector-dropdown-label-text">Personal tools</span> </label> <div class="vector-dropdown-content"> <div id="p-personal" class="vector-menu mw-portlet mw-portlet-personal user-links-collapsible-item" title="User menu" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-sitesupport" class="user-links-collapsible-item mw-list-item"><a href="https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en"><span>Donate</span></a></li><li id="pt-createaccount" class="user-links-collapsible-item mw-list-item"><a href="/w/index.php?title=Special:CreateAccount&returnto=Large+language+model" title="You are encouraged to create an account and log in; however, it is not mandatory"><span class="vector-icon mw-ui-icon-userAdd mw-ui-icon-wikimedia-userAdd"></span> <span>Create account</span></a></li><li id="pt-login" class="user-links-collapsible-item mw-list-item"><a href="/w/index.php?title=Special:UserLogin&returnto=Large+language+model" title="You're encouraged to log in; however, it's not mandatory. 
[o]" accesskey="o"><span class="vector-icon mw-ui-icon-logIn mw-ui-icon-wikimedia-logIn"></span> <span>Log in</span></a></li> </ul> </div> </div> <div id="p-user-menu-anon-editor" class="vector-menu mw-portlet mw-portlet-user-menu-anon-editor" > <div class="vector-menu-heading"> Pages for logged out editors <a href="/wiki/Help:Introduction" aria-label="Learn more about editing"><span>learn more</span></a> </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-anoncontribs" class="mw-list-item"><a href="/wiki/Special:MyContributions" title="A list of edits made from this IP address [y]" accesskey="y"><span>Contributions</span></a></li><li id="pt-anontalk" class="mw-list-item"><a href="/wiki/Special:MyTalk" title="Discussion about edits from this IP address [n]" accesskey="n"><span>Talk</span></a></li> </ul> </div> </div> </div> </div> </nav> </div> </header> </div> <div class="mw-page-container"> <div class="mw-page-container-inner"> <div class="vector-sitenotice-container"> <div id="siteNotice"><!-- CentralNotice --></div> </div> <div class="vector-column-start"> <div class="vector-main-menu-container"> <div id="mw-navigation"> <nav id="mw-panel" class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-pinned-container" class="vector-pinned-container"> </div> </nav> </div> </div> <div class="vector-sticky-pinned-container"> <nav id="mw-panel-toc" aria-label="Contents" data-event-name="ui.sidebar-toc" class="mw-table-of-contents-container vector-toc-landmark"> <div id="vector-toc-pinned-container" class="vector-pinned-container"> <div id="vector-toc" class="vector-toc vector-pinnable-element"> <div class="vector-pinnable-header vector-toc-pinnable-header vector-pinnable-header-pinned" data-feature-name="toc-pinned" data-pinnable-element-id="vector-toc" > <h2 class="vector-pinnable-header-label">Contents</h2> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" 
data-event-name="pinnable-header.vector-toc.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-toc.unpin">hide</button> </div> <ul class="vector-toc-contents" id="mw-panel-toc-list"> <li id="toc-mw-content-text" class="vector-toc-list-item vector-toc-level-1"> <a href="#" class="vector-toc-link"> <div class="vector-toc-text">(Top)</div> </a> </li> <li id="toc-History" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#History"> <div class="vector-toc-text"> <span class="vector-toc-numb">1</span> <span>History</span> </div> </a> <ul id="toc-History-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Dataset_preprocessing" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Dataset_preprocessing"> <div class="vector-toc-text"> <span class="vector-toc-numb">2</span> <span>Dataset preprocessing</span> </div> </a> <button aria-controls="toc-Dataset_preprocessing-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Dataset preprocessing subsection</span> </button> <ul id="toc-Dataset_preprocessing-sublist" class="vector-toc-list"> <li id="toc-Tokenization" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Tokenization"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.1</span> <span>Tokenization</span> </div> </a> <ul id="toc-Tokenization-sublist" class="vector-toc-list"> <li id="toc-BPE" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#BPE"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.1.1</span> <span>BPE</span> </div> </a> <ul id="toc-BPE-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Problems" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" 
href="#Problems"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.1.2</span> <span>Problems</span> </div> </a> <ul id="toc-Problems-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Dataset_cleaning" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Dataset_cleaning"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.2</span> <span>Dataset cleaning</span> </div> </a> <ul id="toc-Dataset_cleaning-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Synthetic_data" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Synthetic_data"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.3</span> <span>Synthetic data</span> </div> </a> <ul id="toc-Synthetic_data-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Training_and_architecture" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Training_and_architecture"> <div class="vector-toc-text"> <span class="vector-toc-numb">3</span> <span>Training and architecture</span> </div> </a> <button aria-controls="toc-Training_and_architecture-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Training and architecture subsection</span> </button> <ul id="toc-Training_and_architecture-sublist" class="vector-toc-list"> <li id="toc-Reinforcement_learning_from_human_feedback_(RLHF)" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Reinforcement_learning_from_human_feedback_(RLHF)"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.1</span> <span>Reinforcement learning from human feedback (RLHF)</span> </div> </a> <ul id="toc-Reinforcement_learning_from_human_feedback_(RLHF)-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Instruction_tuning" class="vector-toc-list-item vector-toc-level-2"> 
<a class="vector-toc-link" href="#Instruction_tuning"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.2</span> <span>Instruction tuning</span> </div> </a> <ul id="toc-Instruction_tuning-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Mixture_of_experts" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Mixture_of_experts"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.3</span> <span>Mixture of experts</span> </div> </a> <ul id="toc-Mixture_of_experts-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Prompt_engineering,_attention_mechanism,_and_context_window" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Prompt_engineering,_attention_mechanism,_and_context_window"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.4</span> <span>Prompt engineering, attention mechanism, and context window</span> </div> </a> <ul id="toc-Prompt_engineering,_attention_mechanism,_and_context_window-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Infrastructure" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Infrastructure"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.5</span> <span>Infrastructure</span> </div> </a> <ul id="toc-Infrastructure-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Training_cost" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Training_cost"> <div class="vector-toc-text"> <span class="vector-toc-numb">4</span> <span>Training cost</span> </div> </a> <ul id="toc-Training_cost-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Tool_use" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Tool_use"> <div class="vector-toc-text"> <span class="vector-toc-numb">5</span> <span>Tool use</span> </div> </a> <ul id="toc-Tool_use-sublist" class="vector-toc-list"> </ul> </li> <li 
id="toc-Agency" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Agency"> <div class="vector-toc-text"> <span class="vector-toc-numb">6</span> <span>Agency</span> </div> </a> <ul id="toc-Agency-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Compression" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Compression"> <div class="vector-toc-text"> <span class="vector-toc-numb">7</span> <span>Compression</span> </div> </a> <ul id="toc-Compression-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Multimodality" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Multimodality"> <div class="vector-toc-text"> <span class="vector-toc-numb">8</span> <span>Multimodality</span> </div> </a> <ul id="toc-Multimodality-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Properties" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Properties"> <div class="vector-toc-text"> <span class="vector-toc-numb">9</span> <span>Properties</span> </div> </a> <button aria-controls="toc-Properties-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Properties subsection</span> </button> <ul id="toc-Properties-sublist" class="vector-toc-list"> <li id="toc-Scaling_laws" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Scaling_laws"> <div class="vector-toc-text"> <span class="vector-toc-numb">9.1</span> <span>Scaling laws</span> </div> </a> <ul id="toc-Scaling_laws-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Emergent_abilities" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Emergent_abilities"> <div class="vector-toc-text"> <span class="vector-toc-numb">9.2</span> <span>Emergent abilities</span> </div> </a> <ul 
id="toc-Emergent_abilities-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Interpretation" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Interpretation"> <div class="vector-toc-text"> <span class="vector-toc-numb">10</span> <span>Interpretation</span> </div> </a> <button aria-controls="toc-Interpretation-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Interpretation subsection</span> </button> <ul id="toc-Interpretation-sublist" class="vector-toc-list"> <li id="toc-Understanding_and_intelligence" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Understanding_and_intelligence"> <div class="vector-toc-text"> <span class="vector-toc-numb">10.1</span> <span>Understanding and intelligence</span> </div> </a> <ul id="toc-Understanding_and_intelligence-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Evaluation" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Evaluation"> <div class="vector-toc-text"> <span class="vector-toc-numb">11</span> <span>Evaluation</span> </div> </a> <button aria-controls="toc-Evaluation-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Evaluation subsection</span> </button> <ul id="toc-Evaluation-sublist" class="vector-toc-list"> <li id="toc-Perplexity" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Perplexity"> <div class="vector-toc-text"> <span class="vector-toc-numb">11.1</span> <span>Perplexity</span> </div> </a> <ul id="toc-Perplexity-sublist" class="vector-toc-list"> <li id="toc-BPW,_BPC,_and_BPT" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#BPW,_BPC,_and_BPT"> <div 
class="vector-toc-text"> <span class="vector-toc-numb">11.1.1</span> <span>BPW, BPC, and BPT</span> </div> </a> <ul id="toc-BPW,_BPC,_and_BPT-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Task-specific_datasets_and_benchmarks" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Task-specific_datasets_and_benchmarks"> <div class="vector-toc-text"> <span class="vector-toc-numb">11.2</span> <span>Task-specific datasets and benchmarks</span> </div> </a> <ul id="toc-Task-specific_datasets_and_benchmarks-sublist" class="vector-toc-list"> <li id="toc-Adversarially_constructed_evaluations" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Adversarially_constructed_evaluations"> <div class="vector-toc-text"> <span class="vector-toc-numb">11.2.1</span> <span>Adversarially constructed evaluations</span> </div> </a> <ul id="toc-Adversarially_constructed_evaluations-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> </ul> </li> <li id="toc-Wider_impact" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Wider_impact"> <div class="vector-toc-text"> <span class="vector-toc-numb">12</span> <span>Wider impact</span> </div> </a> <button aria-controls="toc-Wider_impact-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Wider impact subsection</span> </button> <ul id="toc-Wider_impact-sublist" class="vector-toc-list"> <li id="toc-Memorization_and_copyright" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Memorization_and_copyright"> <div class="vector-toc-text"> <span class="vector-toc-numb">12.1</span> <span>Memorization and copyright</span> </div> </a> <ul id="toc-Memorization_and_copyright-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Security" class="vector-toc-list-item vector-toc-level-2"> <a 
class="vector-toc-link" href="#Security"> <div class="vector-toc-text"> <span class="vector-toc-numb">12.2</span> <span>Security</span> </div> </a> <ul id="toc-Security-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Algorithmic_bias" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Algorithmic_bias"> <div class="vector-toc-text"> <span class="vector-toc-numb">12.3</span> <span>Algorithmic bias</span> </div> </a> <ul id="toc-Algorithmic_bias-sublist" class="vector-toc-list"> <li id="toc-Stereotyping" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Stereotyping"> <div class="vector-toc-text"> <span class="vector-toc-numb">12.3.1</span> <span>Stereotyping</span> </div> </a> <ul id="toc-Stereotyping-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Political_bias" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Political_bias"> <div class="vector-toc-text"> <span class="vector-toc-numb">12.3.2</span> <span>Political bias</span> </div> </a> <ul id="toc-Political_bias-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> </ul> </li> <li id="toc-List_of_large_language_models" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#List_of_large_language_models"> <div class="vector-toc-text"> <span class="vector-toc-numb">13</span> <span>List of large language models</span> </div> </a> <ul id="toc-List_of_large_language_models-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-See_also" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#See_also"> <div class="vector-toc-text"> <span class="vector-toc-numb">14</span> <span>See also</span> </div> </a> <ul id="toc-See_also-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Notes" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Notes"> <div class="vector-toc-text"> <span 
class="vector-toc-numb">15</span> <span>Notes</span> </div> </a> <ul id="toc-Notes-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-References" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#References"> <div class="vector-toc-text"> <span class="vector-toc-numb">16</span> <span>References</span> </div> </a> <ul id="toc-References-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Further_reading" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Further_reading"> <div class="vector-toc-text"> <span class="vector-toc-numb">17</span> <span>Further reading</span> </div> </a> <ul id="toc-Further_reading-sublist" class="vector-toc-list"> </ul> </li> </ul> </div> </div> </nav> </div> </div> <div class="mw-content-container"> <main id="content" class="mw-body"> <header class="mw-body-header vector-page-titlebar"> <nav aria-label="Contents" class="vector-toc-landmark"> <div id="vector-page-titlebar-toc" class="vector-dropdown vector-page-titlebar-toc vector-button-flush-left" > <input type="checkbox" id="vector-page-titlebar-toc-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-titlebar-toc" class="vector-dropdown-checkbox " aria-label="Toggle the table of contents" > <label id="vector-page-titlebar-toc-label" for="vector-page-titlebar-toc-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-listBullet mw-ui-icon-wikimedia-listBullet"></span> <span class="vector-dropdown-label-text">Toggle the table of contents</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-titlebar-toc-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <h1 id="firstHeading" class="firstHeading mw-first-heading"><span class="mw-page-title-main">Large language 
model</span></h1> <div id="p-lang-btn" class="vector-dropdown mw-portlet mw-portlet-lang" > <input type="checkbox" id="p-lang-btn-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-p-lang-btn" class="vector-dropdown-checkbox mw-interlanguage-selector" aria-label="Go to an article in another language. Available in 45 languages" > <label id="p-lang-btn-label" for="p-lang-btn-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--action-progressive mw-portlet-lang-heading-45" aria-hidden="true" ><span class="vector-icon mw-ui-icon-language-progressive mw-ui-icon-wikimedia-language-progressive"></span> <span class="vector-dropdown-label-text">45 languages</span> </label> <div class="vector-dropdown-content"> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li class="interlanguage-link interwiki-af mw-list-item"><a href="https://af.wikipedia.org/wiki/Groot_taalmodel" title="Groot taalmodel – Afrikaans" lang="af" hreflang="af" data-title="Groot taalmodel" data-language-autonym="Afrikaans" data-language-local-name="Afrikaans" class="interlanguage-link-target"><span>Afrikaans</span></a></li><li class="interlanguage-link interwiki-ar mw-list-item"><a href="https://ar.wikipedia.org/wiki/%D9%86%D9%85%D9%88%D8%B0%D8%AC_%D9%84%D8%BA%D9%88%D9%8A_%D9%83%D8%A8%D9%8A%D8%B1" title="نموذج لغوي كبير – Arabic" lang="ar" hreflang="ar" data-title="نموذج لغوي كبير" data-language-autonym="العربية" data-language-local-name="Arabic" class="interlanguage-link-target"><span>العربية</span></a></li><li class="interlanguage-link interwiki-az mw-list-item"><a href="https://az.wikipedia.org/wiki/B%C3%B6y%C3%BCk_dil_modeli" title="Böyük dil modeli – Azerbaijani" lang="az" hreflang="az" data-title="Böyük dil modeli" data-language-autonym="Azərbaycanca" data-language-local-name="Azerbaijani" class="interlanguage-link-target"><span>Azərbaycanca</span></a></li><li 
class="interlanguage-link interwiki-zh-min-nan mw-list-item"><a href="https://zh-min-nan.wikipedia.org/wiki/T%C5%8Da-h%C3%AAng_g%C3%AD-gi%C3%A2n_b%C3%B4%CD%98-h%C3%AAng" title="Tōa-hêng gí-giân bô͘-hêng – Minnan" lang="nan" hreflang="nan" data-title="Tōa-hêng gí-giân bô͘-hêng" data-language-autonym="閩南語 / Bân-lâm-gú" data-language-local-name="Minnan" class="interlanguage-link-target"><span>閩南語 / Bân-lâm-gú</span></a></li><li class="interlanguage-link interwiki-bar mw-list-item"><a href="https://bar.wikipedia.org/wiki/Large_language_model" title="Large language model – Bavarian" lang="bar" hreflang="bar" data-title="Large language model" data-language-autonym="Boarisch" data-language-local-name="Bavarian" class="interlanguage-link-target"><span>Boarisch</span></a></li><li class="interlanguage-link interwiki-bs mw-list-item"><a href="https://bs.wikipedia.org/wiki/Veliki_jezi%C4%8Dki_modeli" title="Veliki jezički modeli – Bosnian" lang="bs" hreflang="bs" data-title="Veliki jezički modeli" data-language-autonym="Bosanski" data-language-local-name="Bosnian" class="interlanguage-link-target"><span>Bosanski</span></a></li><li class="interlanguage-link interwiki-ca mw-list-item"><a href="https://ca.wikipedia.org/wiki/Model_de_llenguatge_extens" title="Model de llenguatge extens – Catalan" lang="ca" hreflang="ca" data-title="Model de llenguatge extens" data-language-autonym="Català" data-language-local-name="Catalan" class="interlanguage-link-target"><span>Català</span></a></li><li class="interlanguage-link interwiki-cs mw-list-item"><a href="https://cs.wikipedia.org/wiki/Velk%C3%BD_jazykov%C3%BD_model" title="Velký jazykový model – Czech" lang="cs" hreflang="cs" data-title="Velký jazykový model" data-language-autonym="Čeština" data-language-local-name="Czech" class="interlanguage-link-target"><span>Čeština</span></a></li><li class="interlanguage-link interwiki-de mw-list-item"><a href="https://de.wikipedia.org/wiki/Large_Language_Model" title="Large Language Model – 
German" lang="de" hreflang="de" data-title="Large Language Model" data-language-autonym="Deutsch" data-language-local-name="German" class="interlanguage-link-target"><span>Deutsch</span></a></li><li class="interlanguage-link interwiki-el mw-list-item"><a href="https://el.wikipedia.org/wiki/%CE%9C%CE%B5%CE%B3%CE%AC%CE%BB%CE%BF_%CE%B3%CE%BB%CF%89%CF%83%CF%83%CE%B9%CE%BA%CF%8C_%CE%BC%CE%BF%CE%BD%CF%84%CE%AD%CE%BB%CE%BF" title="Μεγάλο γλωσσικό μοντέλο – Greek" lang="el" hreflang="el" data-title="Μεγάλο γλωσσικό μοντέλο" data-language-autonym="Ελληνικά" data-language-local-name="Greek" class="interlanguage-link-target"><span>Ελληνικά</span></a></li><li class="interlanguage-link interwiki-es mw-list-item"><a href="https://es.wikipedia.org/wiki/Modelo_extenso_de_lenguaje" title="Modelo extenso de lenguaje – Spanish" lang="es" hreflang="es" data-title="Modelo extenso de lenguaje" data-language-autonym="Español" data-language-local-name="Spanish" class="interlanguage-link-target"><span>Español</span></a></li><li class="interlanguage-link interwiki-eu mw-list-item"><a href="https://eu.wikipedia.org/wiki/Hizkuntza_Eredu_Handiak_(LLM)" title="Hizkuntza Eredu Handiak (LLM) – Basque" lang="eu" hreflang="eu" data-title="Hizkuntza Eredu Handiak (LLM)" data-language-autonym="Euskara" data-language-local-name="Basque" class="interlanguage-link-target"><span>Euskara</span></a></li><li class="interlanguage-link interwiki-fa mw-list-item"><a href="https://fa.wikipedia.org/wiki/%D9%85%D8%AF%D9%84_%D8%B2%D8%A8%D8%A7%D9%86%DB%8C_%D8%A8%D8%B2%D8%B1%DA%AF" title="مدل زبانی بزرگ – Persian" lang="fa" hreflang="fa" data-title="مدل زبانی بزرگ" data-language-autonym="فارسی" data-language-local-name="Persian" class="interlanguage-link-target"><span>فارسی</span></a></li><li class="interlanguage-link interwiki-fr mw-list-item"><a href="https://fr.wikipedia.org/wiki/Grand_mod%C3%A8le_de_langage" title="Grand modèle de langage – French" lang="fr" hreflang="fr" data-title="Grand modèle de langage" 
data-language-autonym="Français" data-language-local-name="French" class="interlanguage-link-target"><span>Français</span></a></li><li class="interlanguage-link interwiki-ga mw-list-item"><a href="https://ga.wikipedia.org/wiki/Samhail_teanga_mh%C3%B3r" title="Samhail teanga mhór – Irish" lang="ga" hreflang="ga" data-title="Samhail teanga mhór" data-language-autonym="Gaeilge" data-language-local-name="Irish" class="interlanguage-link-target"><span>Gaeilge</span></a></li><li class="interlanguage-link interwiki-gl mw-list-item"><a href="https://gl.wikipedia.org/wiki/Modelo_de_linguaxe_de_grande_escala" title="Modelo de linguaxe de grande escala – Galician" lang="gl" hreflang="gl" data-title="Modelo de linguaxe de grande escala" data-language-autonym="Galego" data-language-local-name="Galician" class="interlanguage-link-target"><span>Galego</span></a></li><li class="interlanguage-link interwiki-ko mw-list-item"><a href="https://ko.wikipedia.org/wiki/%EB%8C%80%ED%98%95_%EC%96%B8%EC%96%B4_%EB%AA%A8%EB%8D%B8" title="대형 언어 모델 – Korean" lang="ko" hreflang="ko" data-title="대형 언어 모델" data-language-autonym="한국어" data-language-local-name="Korean" class="interlanguage-link-target"><span>한국어</span></a></li><li class="interlanguage-link interwiki-hi mw-list-item"><a href="https://hi.wikipedia.org/wiki/%E0%A4%AC%E0%A4%A1%E0%A4%BC%E0%A5%87_%E0%A4%AD%E0%A4%BE%E0%A4%B7%E0%A4%BE_%E0%A4%AE%E0%A5%89%E0%A4%A1%E0%A4%B2" title="बड़े भाषा मॉडल – Hindi" lang="hi" hreflang="hi" data-title="बड़े भाषा मॉडल" data-language-autonym="हिन्दी" data-language-local-name="Hindi" class="interlanguage-link-target"><span>हिन्दी</span></a></li><li class="interlanguage-link interwiki-id mw-list-item"><a href="https://id.wikipedia.org/wiki/Model_bahasa_besar" title="Model bahasa besar – Indonesian" lang="id" hreflang="id" data-title="Model bahasa besar" data-language-autonym="Bahasa Indonesia" data-language-local-name="Indonesian" class="interlanguage-link-target"><span>Bahasa Indonesia</span></a></li><li 
class="interlanguage-link interwiki-zu mw-list-item"><a href="https://zu.wikipedia.org/wiki/UNongo_lolimi_olukhulu" title="UNongo lolimi olukhulu – Zulu" lang="zu" hreflang="zu" data-title="UNongo lolimi olukhulu" data-language-autonym="IsiZulu" data-language-local-name="Zulu" class="interlanguage-link-target"><span>IsiZulu</span></a></li><li class="interlanguage-link interwiki-it mw-list-item"><a href="https://it.wikipedia.org/wiki/Modello_linguistico_di_grandi_dimensioni" title="Modello linguistico di grandi dimensioni – Italian" lang="it" hreflang="it" data-title="Modello linguistico di grandi dimensioni" data-language-autonym="Italiano" data-language-local-name="Italian" class="interlanguage-link-target"><span>Italiano</span></a></li><li class="interlanguage-link interwiki-he mw-list-item"><a href="https://he.wikipedia.org/wiki/%D7%9E%D7%95%D7%93%D7%9C_%D7%A9%D7%A4%D7%94_%D7%92%D7%93%D7%95%D7%9C" title="מודל שפה גדול – Hebrew" lang="he" hreflang="he" data-title="מודל שפה גדול" data-language-autonym="עברית" data-language-local-name="Hebrew" class="interlanguage-link-target"><span>עברית</span></a></li><li class="interlanguage-link interwiki-hu mw-list-item"><a href="https://hu.wikipedia.org/wiki/Nagy_nyelvi_modell" title="Nagy nyelvi modell – Hungarian" lang="hu" hreflang="hu" data-title="Nagy nyelvi modell" data-language-autonym="Magyar" data-language-local-name="Hungarian" class="interlanguage-link-target"><span>Magyar</span></a></li><li class="interlanguage-link interwiki-mk mw-list-item"><a href="https://mk.wikipedia.org/wiki/%D0%93%D0%BE%D0%BB%D0%B5%D0%BC_%D1%98%D0%B0%D0%B7%D0%B8%D1%87%D0%B5%D0%BD_%D0%BC%D0%BE%D0%B4%D0%B5%D0%BB" title="Голем јазичен модел – Macedonian" lang="mk" hreflang="mk" data-title="Голем јазичен модел" data-language-autonym="Македонски" data-language-local-name="Macedonian" class="interlanguage-link-target"><span>Македонски</span></a></li><li class="interlanguage-link interwiki-nl mw-list-item"><a 
href="https://nl.wikipedia.org/wiki/Groot_taalmodel" title="Groot taalmodel – Dutch" lang="nl" hreflang="nl" data-title="Groot taalmodel" data-language-autonym="Nederlands" data-language-local-name="Dutch" class="interlanguage-link-target"><span>Nederlands</span></a></li><li class="interlanguage-link interwiki-ja mw-list-item"><a href="https://ja.wikipedia.org/wiki/%E5%A4%A7%E8%A6%8F%E6%A8%A1%E8%A8%80%E8%AA%9E%E3%83%A2%E3%83%87%E3%83%AB" title="大規模言語モデル – Japanese" lang="ja" hreflang="ja" data-title="大規模言語モデル" data-language-autonym="日本語" data-language-local-name="Japanese" class="interlanguage-link-target"><span>日本語</span></a></li><li class="interlanguage-link interwiki-pl mw-list-item"><a href="https://pl.wikipedia.org/wiki/Du%C5%BCy_model_j%C4%99zykowy" title="Duży model językowy – Polish" lang="pl" hreflang="pl" data-title="Duży model językowy" data-language-autonym="Polski" data-language-local-name="Polish" class="interlanguage-link-target"><span>Polski</span></a></li><li class="interlanguage-link interwiki-pt mw-list-item"><a href="https://pt.wikipedia.org/wiki/Modelos_de_linguagem_de_grande_escala" title="Modelos de linguagem de grande escala – Portuguese" lang="pt" hreflang="pt" data-title="Modelos de linguagem de grande escala" data-language-autonym="Português" data-language-local-name="Portuguese" class="interlanguage-link-target"><span>Português</span></a></li><li class="interlanguage-link interwiki-kaa mw-list-item"><a href="https://kaa.wikipedia.org/wiki/%C3%9Alken_til_modeli" title="Úlken til modeli – Kara-Kalpak" lang="kaa" hreflang="kaa" data-title="Úlken til modeli" data-language-autonym="Qaraqalpaqsha" data-language-local-name="Kara-Kalpak" class="interlanguage-link-target"><span>Qaraqalpaqsha</span></a></li><li class="interlanguage-link interwiki-ro mw-list-item"><a href="https://ro.wikipedia.org/wiki/Model_lingvistic_mare" title="Model lingvistic mare – Romanian" lang="ro" hreflang="ro" data-title="Model lingvistic mare" 
data-language-autonym="Română" data-language-local-name="Romanian" class="interlanguage-link-target"><span>Română</span></a></li><li class="interlanguage-link interwiki-qu mw-list-item"><a href="https://qu.wikipedia.org/wiki/Hatun_simi_wallpama" title="Hatun simi wallpama – Quechua" lang="qu" hreflang="qu" data-title="Hatun simi wallpama" data-language-autonym="Runa Simi" data-language-local-name="Quechua" class="interlanguage-link-target"><span>Runa Simi</span></a></li><li class="interlanguage-link interwiki-ru mw-list-item"><a href="https://ru.wikipedia.org/wiki/%D0%91%D0%BE%D0%BB%D1%8C%D1%88%D0%B0%D1%8F_%D1%8F%D0%B7%D1%8B%D0%BA%D0%BE%D0%B2%D0%B0%D1%8F_%D0%BC%D0%BE%D0%B4%D0%B5%D0%BB%D1%8C" title="Большая языковая модель – Russian" lang="ru" hreflang="ru" data-title="Большая языковая модель" data-language-autonym="Русский" data-language-local-name="Russian" class="interlanguage-link-target"><span>Русский</span></a></li><li class="interlanguage-link interwiki-sq mw-list-item"><a href="https://sq.wikipedia.org/wiki/Modeli_i_gjuh%C3%ABs_s%C3%AB_madhe" title="Modeli i gjuhës së madhe – Albanian" lang="sq" hreflang="sq" data-title="Modeli i gjuhës së madhe" data-language-autonym="Shqip" data-language-local-name="Albanian" class="interlanguage-link-target"><span>Shqip</span></a></li><li class="interlanguage-link interwiki-sl mw-list-item"><a href="https://sl.wikipedia.org/wiki/Obse%C5%BEni_jezikovni_model" title="Obsežni jezikovni model – Slovenian" lang="sl" hreflang="sl" data-title="Obsežni jezikovni model" data-language-autonym="Slovenščina" data-language-local-name="Slovenian" class="interlanguage-link-target"><span>Slovenščina</span></a></li><li class="interlanguage-link interwiki-ckb mw-list-item"><a href="https://ckb.wikipedia.org/wiki/%D9%85%DB%86%D8%AF%DB%8E%D9%84%DB%8C_%D8%B2%D9%85%D8%A7%D9%86%DB%8C_%DA%AF%DB%95%D9%88%D8%B1%DB%95" title="مۆدێلی زمانی گەورە – Central Kurdish" lang="ckb" hreflang="ckb" data-title="مۆدێلی زمانی گەورە" 
data-language-autonym="کوردی" data-language-local-name="Central Kurdish" class="interlanguage-link-target"><span>کوردی</span></a></li><li class="interlanguage-link interwiki-sr mw-list-item"><a href="https://sr.wikipedia.org/wiki/Veliki_jezi%C4%8Dki_modeli" title="Veliki jezički modeli – Serbian" lang="sr" hreflang="sr" data-title="Veliki jezički modeli" data-language-autonym="Српски / srpski" data-language-local-name="Serbian" class="interlanguage-link-target"><span>Српски / srpski</span></a></li><li class="interlanguage-link interwiki-tl mw-list-item"><a href="https://tl.wikipedia.org/wiki/Malaking_modelong_pangwika" title="Malaking modelong pangwika – Tagalog" lang="tl" hreflang="tl" data-title="Malaking modelong pangwika" data-language-autonym="Tagalog" data-language-local-name="Tagalog" class="interlanguage-link-target"><span>Tagalog</span></a></li><li class="interlanguage-link interwiki-th mw-list-item"><a href="https://th.wikipedia.org/wiki/%E0%B9%81%E0%B8%9A%E0%B8%9A%E0%B8%88%E0%B8%B3%E0%B8%A5%E0%B8%AD%E0%B8%87%E0%B8%A0%E0%B8%B2%E0%B8%A9%E0%B8%B2%E0%B8%82%E0%B8%99%E0%B8%B2%E0%B8%94%E0%B9%83%E0%B8%AB%E0%B8%8D%E0%B9%88" title="แบบจำลองภาษาขนาดใหญ่ – Thai" lang="th" hreflang="th" data-title="แบบจำลองภาษาขนาดใหญ่" data-language-autonym="ไทย" data-language-local-name="Thai" class="interlanguage-link-target"><span>ไทย</span></a></li><li class="interlanguage-link interwiki-tr mw-list-item"><a href="https://tr.wikipedia.org/wiki/Geni%C5%9F_dil_modeli" title="Geniş dil modeli – Turkish" lang="tr" hreflang="tr" data-title="Geniş dil modeli" data-language-autonym="Türkçe" data-language-local-name="Turkish" class="interlanguage-link-target"><span>Türkçe</span></a></li><li class="interlanguage-link interwiki-uk mw-list-item"><a href="https://uk.wikipedia.org/wiki/%D0%92%D0%B5%D0%BB%D0%B8%D0%BA%D0%B0_%D0%BC%D0%BE%D0%B2%D0%BD%D0%B0_%D0%BC%D0%BE%D0%B4%D0%B5%D0%BB%D1%8C" title="Велика мовна модель – Ukrainian" lang="uk" hreflang="uk" data-title="Велика мовна модель" 
data-language-autonym="Українська" data-language-local-name="Ukrainian" class="interlanguage-link-target"><span>Українська</span></a></li><li class="interlanguage-link interwiki-ug mw-list-item"><a href="https://ug.wikipedia.org/wiki/%DA%86%D9%88%DA%AD_%D8%AA%D9%89%D9%84_%D9%85%D9%88%D8%AF%D9%89%D9%84%D9%89" title="چوڭ تىل مودىلى – Uyghur" lang="ug" hreflang="ug" data-title="چوڭ تىل مودىلى" data-language-autonym="ئۇيغۇرچە / Uyghurche" data-language-local-name="Uyghur" class="interlanguage-link-target"><span>ئۇيغۇرچە / Uyghurche</span></a></li><li class="interlanguage-link interwiki-vi mw-list-item"><a href="https://vi.wikipedia.org/wiki/M%C3%B4_h%C3%ACnh_ng%C3%B4n_ng%E1%BB%AF_l%E1%BB%9Bn" title="Mô hình ngôn ngữ lớn – Vietnamese" lang="vi" hreflang="vi" data-title="Mô hình ngôn ngữ lớn" data-language-autonym="Tiếng Việt" data-language-local-name="Vietnamese" class="interlanguage-link-target"><span>Tiếng Việt</span></a></li><li class="interlanguage-link interwiki-zh-classical mw-list-item"><a href="https://zh-classical.wikipedia.org/wiki/%E5%A4%A7%E8%AA%9E%E8%A8%80%E6%A8%A1%E5%9E%8B" title="大語言模型 – Literary Chinese" lang="lzh" hreflang="lzh" data-title="大語言模型" data-language-autonym="文言" data-language-local-name="Literary Chinese" class="interlanguage-link-target"><span>文言</span></a></li><li class="interlanguage-link interwiki-zh-yue mw-list-item"><a href="https://zh-yue.wikipedia.org/wiki/%E5%A4%A7%E5%9E%8B%E8%AA%9E%E8%A8%80%E6%A8%A1%E5%9E%8B" title="大型語言模型 – Cantonese" lang="yue" hreflang="yue" data-title="大型語言模型" data-language-autonym="粵語" data-language-local-name="Cantonese" class="interlanguage-link-target"><span>粵語</span></a></li><li class="interlanguage-link interwiki-zh mw-list-item"><a href="https://zh.wikipedia.org/wiki/%E5%A4%A7%E5%9E%8B%E8%AF%AD%E8%A8%80%E6%A8%A1%E5%9E%8B" title="大型语言模型 – Chinese" lang="zh" hreflang="zh" data-title="大型语言模型" data-language-autonym="中文" data-language-local-name="Chinese" 
class="interlanguage-link-target"><span>中文</span></a></li> </ul> <div class="after-portlet after-portlet-lang"><span class="wb-langlinks-edit wb-langlinks-link"><a href="https://www.wikidata.org/wiki/Special:EntityPage/Q115305900#sitelinks-wikipedia" title="Edit interlanguage links" class="wbc-editpage">Edit links</a></span></div> </div> </div> </div> </header> <div class="vector-page-toolbar"> <div class="vector-page-toolbar-container"> <div id="left-navigation"> <nav aria-label="Namespaces"> <div id="p-associated-pages" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-associated-pages" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-nstab-main" class="selected vector-tab-noicon mw-list-item"><a href="/wiki/Large_language_model" title="View the content page [c]" accesskey="c"><span>Article</span></a></li><li id="ca-talk" class="vector-tab-noicon mw-list-item"><a href="/wiki/Talk:Large_language_model" rel="discussion" title="Discuss improvements to the content page [t]" accesskey="t"><span>Talk</span></a></li> </ul> </div> </div> <div id="vector-variants-dropdown" class="vector-dropdown emptyPortlet" > <input type="checkbox" id="vector-variants-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-variants-dropdown" class="vector-dropdown-checkbox " aria-label="Change language variant" > <label id="vector-variants-dropdown-label" for="vector-variants-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" aria-hidden="true" ><span class="vector-dropdown-label-text">English</span> </label> <div class="vector-dropdown-content"> <div id="p-variants" class="vector-menu mw-portlet mw-portlet-variants emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> </div> </div> </nav> </div> <div id="right-navigation" class="vector-collapsible"> <nav 
aria-label="Views">
<!-- Views tabs: Read / Edit / View history.
     Throughout this section, adjacent list items stay flush against each other
     (the line break sits inside the li end tag) so that no whitespace text node
     is introduced between them.
     Query-string separators inside href values are written as &amp; so the
     attribute does not depend on lenient parsing of a bare ampersand; the
     decoded URL is unchanged. -->
<div id="p-views" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-views">
  <div class="vector-menu-content">
    <ul class="vector-menu-content-list">
      <li id="ca-view" class="selected vector-tab-noicon mw-list-item"><a href="/wiki/Large_language_model"><span>Read</span></a></li
      ><li id="ca-edit" class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Large_language_model&amp;action=edit" title="Edit this page [e]" accesskey="e"><span>Edit</span></a></li
      ><li id="ca-history" class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Large_language_model&amp;action=history" title="Past revisions of this page [h]" accesskey="h"><span>View history</span></a></li>
    </ul>
  </div>
</div>
</nav>
<nav class="vector-page-tools-landmark" aria-label="Page tools">
<div id="vector-page-tools-dropdown" class="vector-dropdown vector-page-tools-dropdown">
  <!-- Dropdown trigger: a checkbox exposed with role="button", plus the label
       bound to it through for/id. The label is aria-hidden, so the checkbox's
       aria-label supplies the accessible name. -->
  <input type="checkbox" id="vector-page-tools-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-tools-dropdown" class="vector-dropdown-checkbox " aria-label="Tools">
  <label id="vector-page-tools-dropdown-label" for="vector-page-tools-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" aria-hidden="true"><span class="vector-dropdown-label-text">Tools</span> </label>
  <div class="vector-dropdown-content">
    <div id="vector-page-tools-unpinned-container" class="vector-unpinned-container">
      <div id="vector-page-tools" class="vector-page-tools vector-pinnable-element">
        <!-- Pinnable header: the data-* attributes name the element to move and
             the ids of its pinned and unpinned containers. -->
        <div class="vector-pinnable-header vector-page-tools-pinnable-header vector-pinnable-header-unpinned" data-feature-name="page-tools-pinned" data-pinnable-element-id="vector-page-tools" data-pinned-container-id="vector-page-tools-pinned-container" data-unpinned-container-id="vector-page-tools-unpinned-container">
          <div class="vector-pinnable-header-label">Tools</div>
          <!-- type="button" is explicit: a button without a type defaults to
               "submit", and these controls are in-page actions only. -->
          <button type="button" class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-page-tools.pin">move to sidebar</button>
          <button type="button" class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-page-tools.unpin">hide</button>
        </div>
        <!-- "Actions" menu: repeats the Read / Edit / View history links of the Views tabs. -->
        <div id="p-cactions" class="vector-menu mw-portlet mw-portlet-cactions emptyPortlet vector-has-collapsible-items" title="More options">
          <div class="vector-menu-heading"> Actions </div>
          <div class="vector-menu-content">
            <ul class="vector-menu-content-list">
              <li id="ca-more-view" class="selected vector-more-collapsible-item mw-list-item"><a href="/wiki/Large_language_model"><span>Read</span></a></li
              ><li id="ca-more-edit" class="vector-more-collapsible-item mw-list-item"><a href="/w/index.php?title=Large_language_model&amp;action=edit" title="Edit this page [e]" accesskey="e"><span>Edit</span></a></li
              ><li id="ca-more-history" class="vector-more-collapsible-item mw-list-item"><a href="/w/index.php?title=Large_language_model&amp;action=history"><span>View history</span></a></li>
            </ul>
          </div>
        </div>
        <!-- "General" menu: page-level utility links. -->
        <div id="p-tb" class="vector-menu mw-portlet mw-portlet-tb">
          <div class="vector-menu-heading"> General </div>
          <div class="vector-menu-content">
            <ul class="vector-menu-content-list">
              <li id="t-whatlinkshere" class="mw-list-item"><a href="/wiki/Special:WhatLinksHere/Large_language_model" title="List of all English Wikipedia pages containing links to this page [j]" accesskey="j"><span>What links here</span></a></li
              ><li id="t-recentchangeslinked" class="mw-list-item"><a href="/wiki/Special:RecentChangesLinked/Large_language_model" rel="nofollow" title="Recent changes in pages linked from this page [k]" accesskey="k"><span>Related changes</span></a></li
              ><li id="t-upload" class="mw-list-item"><a href="/wiki/Wikipedia:File_Upload_Wizard" title="Upload files [u]" accesskey="u"><span>Upload file</span></a></li
              ><li id="t-specialpages" class="mw-list-item"><a href="/wiki/Special:SpecialPages" title="A list of all special pages [q]" accesskey="q"><span>Special pages</span></a></li
              ><li id="t-permalink" class="mw-list-item"><a href="/w/index.php?title=Large_language_model&amp;oldid=1258457238" title="Permanent link to this revision of this page"><span>Permanent link</span></a></li
              ><li id="t-info" class="mw-list-item"><a href="/w/index.php?title=Large_language_model&amp;action=info" title="More information about this page"><span>Page information</span></a></li
              ><li id="t-cite" class="mw-list-item"><a href="/w/index.php?title=Special:CiteThisPage&amp;page=Large_language_model&amp;id=1258457238&amp;wpFormIdentifier=titleform" title="Information on how to cite this page"><span>Cite this page</span></a></li
              ><li id="t-urlshortener" class="mw-list-item"><a href="/w/index.php?title=Special:UrlShortener&amp;url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FLarge_language_model"><span>Get shortened URL</span></a></li
              ><li id="t-urlshortener-qrcode" class="mw-list-item"><a href="/w/index.php?title=Special:QrCode&amp;url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FLarge_language_model"><span>Download QR code</span></a></li>
            </ul>
          </div>
        </div>
        <!-- "Print/export" menu. -->
        <div id="p-coll-print_export" class="vector-menu mw-portlet mw-portlet-coll-print_export">
          <div class="vector-menu-heading"> Print/export </div>
          <div class="vector-menu-content">
            <ul class="vector-menu-content-list">
              <li id="coll-download-as-rl" class="mw-list-item"><a href="/w/index.php?title=Special:DownloadAsPdf&amp;page=Large_language_model&amp;action=show-download-screen" title="Download this page as a PDF file"><span>Download as PDF</span></a></li
              ><li id="t-print" class="mw-list-item"><a href="/w/index.php?title=Large_language_model&amp;printable=yes" title="Printable version of this page [p]" accesskey="p"><span>Printable version</span></a></li>
            </ul>
          </div>
        </div>
        <!-- "In other projects" menu; its link list continues after this point. -->
        <div id="p-wikibase-otherprojects" class="vector-menu mw-portlet mw-portlet-wikibase-otherprojects">
          <div class="vector-menu-heading"> In other projects </div>
          <div 
class="vector-menu-content">
  <!-- Sister-project links. The two items stay flush against each other (the
       line break sits inside the li end tag) so no whitespace text node is
       introduced between them. -->
  <ul class="vector-menu-content-list">
    <li class="wb-otherproject-link wb-otherproject-commons mw-list-item"><a href="https://commons.wikimedia.org/wiki/Category:Large_language_models" hreflang="en"><span>Wikimedia Commons</span></a></li
    ><li id="t-wikibase" class="wb-otherproject-link wb-otherproject-wikibase-dataitem mw-list-item"><a href="https://www.wikidata.org/wiki/Special:EntityPage/Q115305900" title="Structured data on this page hosted by Wikidata [g]" accesskey="g"><span>Wikidata item</span></a></li>
  </ul>
</div>
</div>
</div>
</div>
</div>
</div>
</nav>
</div>
</div>
</div>
<!-- End column: holds the pinned-container targets for the page tools and
     appearance panels. -->
<div class="vector-column-end">
  <div class="vector-sticky-pinned-container">
    <nav class="vector-page-tools-landmark" aria-label="Page tools">
      <div id="vector-page-tools-pinned-container" class="vector-pinned-container">
      </div>
    </nav>
    <nav class="vector-appearance-landmark" aria-label="Appearance">
      <div id="vector-appearance-pinned-container" class="vector-pinned-container">
        <div id="vector-appearance" class="vector-appearance vector-pinnable-element">
          <!-- Pinnable header: the data-* attributes name the element to move
               and the ids of its pinned and unpinned containers. -->
          <div class="vector-pinnable-header vector-appearance-pinnable-header vector-pinnable-header-pinned" data-feature-name="appearance-pinned" data-pinnable-element-id="vector-appearance" data-pinned-container-id="vector-appearance-pinned-container" data-unpinned-container-id="vector-appearance-unpinned-container">
            <div class="vector-pinnable-header-label">Appearance</div>
            <!-- type="button" is explicit: a button without a type defaults to
                 "submit", and these controls are in-page actions only. -->
            <button type="button" class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-appearance.pin">move to sidebar</button>
            <button type="button" class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-appearance.unpin">hide</button>
          </div>
        </div>
      </div>
    </nav>
  </div>
</div>
<!-- Article body container, labelled by the page's first heading. -->
<div id="bodyContent" class="vector-body" aria-labelledby="firstHeading" data-mw-ve-target-container>
  <div class="vector-body-before-content">
    <div class="mw-indicators">
    </div>
    <div 
id="siteSub" class="noprint">From Wikipedia, the free encyclopedia</div> </div> <div id="contentSub"><div id="mw-content-subtitle"></div></div> <div id="mw-content-text" class="mw-body-content"><div class="mw-content-ltr mw-parser-output" lang="en" dir="ltr"><div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">Type of artificial neural network</div> <style data-mw-deduplicate="TemplateStyles:r1236090951">.mw-parser-output .hatnote{font-style:italic}.mw-parser-output div.hatnote{padding-left:1.6em;margin-bottom:0.5em}.mw-parser-output .hatnote i{font-style:normal}.mw-parser-output .hatnote+link+.hatnote{margin-top:-0.5em}@media print{body.ns-0 .mw-parser-output .hatnote{display:none!important}}</style><div role="note" class="hatnote navigation-not-searchable">Not to be confused with <a href="/wiki/Logic_learning_machine" title="Logic learning machine">Logic learning machine</a>.</div> <style data-mw-deduplicate="TemplateStyles:r1244144826">.mw-parser-output .machine-learning-list-title{background-color:#ddddff}html.skin-theme-clientpref-night .mw-parser-output .machine-learning-list-title{background-color:#222}@media(prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .machine-learning-list-title{background-color:#222}}</style> <style data-mw-deduplicate="TemplateStyles:r1129693374">.mw-parser-output .hlist dl,.mw-parser-output .hlist ol,.mw-parser-output .hlist ul{margin:0;padding:0}.mw-parser-output .hlist dd,.mw-parser-output .hlist dt,.mw-parser-output .hlist li{margin:0;display:inline}.mw-parser-output .hlist.inline,.mw-parser-output .hlist.inline dl,.mw-parser-output .hlist.inline ol,.mw-parser-output .hlist.inline ul,.mw-parser-output .hlist dl dl,.mw-parser-output .hlist dl ol,.mw-parser-output .hlist dl ul,.mw-parser-output .hlist ol dl,.mw-parser-output .hlist ol ol,.mw-parser-output .hlist ol ul,.mw-parser-output .hlist ul dl,.mw-parser-output .hlist ul ol,.mw-parser-output .hlist ul 
ul{display:inline}.mw-parser-output .hlist .mw-empty-li{display:none}.mw-parser-output .hlist dt::after{content:": "}.mw-parser-output .hlist dd::after,.mw-parser-output .hlist li::after{content:" · ";font-weight:bold}.mw-parser-output .hlist dd:last-child::after,.mw-parser-output .hlist dt:last-child::after,.mw-parser-output .hlist li:last-child::after{content:none}.mw-parser-output .hlist dd dd:first-child::before,.mw-parser-output .hlist dd dt:first-child::before,.mw-parser-output .hlist dd li:first-child::before,.mw-parser-output .hlist dt dd:first-child::before,.mw-parser-output .hlist dt dt:first-child::before,.mw-parser-output .hlist dt li:first-child::before,.mw-parser-output .hlist li dd:first-child::before,.mw-parser-output .hlist li dt:first-child::before,.mw-parser-output .hlist li li:first-child::before{content:" (";font-weight:normal}.mw-parser-output .hlist dd dd:last-child::after,.mw-parser-output .hlist dd dt:last-child::after,.mw-parser-output .hlist dd li:last-child::after,.mw-parser-output .hlist dt dd:last-child::after,.mw-parser-output .hlist dt dt:last-child::after,.mw-parser-output .hlist dt li:last-child::after,.mw-parser-output .hlist li dd:last-child::after,.mw-parser-output .hlist li dt:last-child::after,.mw-parser-output .hlist li li:last-child::after{content:")";font-weight:normal}.mw-parser-output .hlist ol{counter-reset:listitem}.mw-parser-output .hlist ol>li{counter-increment:listitem}.mw-parser-output .hlist ol>li::before{content:" "counter(listitem)"\a0 "}.mw-parser-output .hlist dd ol>li:first-child::before,.mw-parser-output .hlist dt ol>li:first-child::before,.mw-parser-output .hlist li ol>li:first-child::before{content:" ("counter(listitem)"\a0 "}</style><style data-mw-deduplicate="TemplateStyles:r1246091330">.mw-parser-output .sidebar{width:22em;float:right;clear:right;margin:0.5em 0 1em 1em;background:var(--background-color-neutral-subtle,#f8f9fa);border:1px solid 
var(--border-color-base,#a2a9b1);padding:0.2em;text-align:center;line-height:1.4em;font-size:88%;border-collapse:collapse;display:table}body.skin-minerva .mw-parser-output .sidebar{display:table!important;float:right!important;margin:0.5em 0 1em 1em!important}.mw-parser-output .sidebar-subgroup{width:100%;margin:0;border-spacing:0}.mw-parser-output .sidebar-left{float:left;clear:left;margin:0.5em 1em 1em 0}.mw-parser-output .sidebar-none{float:none;clear:both;margin:0.5em 1em 1em 0}.mw-parser-output .sidebar-outer-title{padding:0 0.4em 0.2em;font-size:125%;line-height:1.2em;font-weight:bold}.mw-parser-output .sidebar-top-image{padding:0.4em}.mw-parser-output .sidebar-top-caption,.mw-parser-output .sidebar-pretitle-with-top-image,.mw-parser-output .sidebar-caption{padding:0.2em 0.4em 0;line-height:1.2em}.mw-parser-output .sidebar-pretitle{padding:0.4em 0.4em 0;line-height:1.2em}.mw-parser-output .sidebar-title,.mw-parser-output .sidebar-title-with-pretitle{padding:0.2em 0.8em;font-size:145%;line-height:1.2em}.mw-parser-output .sidebar-title-with-pretitle{padding:0.1em 0.4em}.mw-parser-output .sidebar-image{padding:0.2em 0.4em 0.4em}.mw-parser-output .sidebar-heading{padding:0.1em 0.4em}.mw-parser-output .sidebar-content{padding:0 0.5em 0.4em}.mw-parser-output .sidebar-content-with-subgroup{padding:0.1em 0.4em 0.2em}.mw-parser-output .sidebar-above,.mw-parser-output .sidebar-below{padding:0.3em 0.8em;font-weight:bold}.mw-parser-output .sidebar-collapse .sidebar-above,.mw-parser-output .sidebar-collapse .sidebar-below{border-top:1px solid #aaa;border-bottom:1px solid #aaa}.mw-parser-output .sidebar-navbar{text-align:right;font-size:115%;padding:0 0.4em 0.4em}.mw-parser-output .sidebar-list-title{padding:0 0.4em;text-align:left;font-weight:bold;line-height:1.6em;font-size:105%}.mw-parser-output .sidebar-list-title-c{padding:0 0.4em;text-align:center;margin:0 3.3em}@media(max-width:640px){body.mediawiki .mw-parser-output 
.sidebar{width:100%!important;clear:both;float:none!important;margin-left:0!important;margin-right:0!important}}body.skin--responsive .mw-parser-output .sidebar a>img{max-width:none!important}@media screen{html.skin-theme-clientpref-night .mw-parser-output .sidebar:not(.notheme) .sidebar-list-title,html.skin-theme-clientpref-night .mw-parser-output .sidebar:not(.notheme) .sidebar-title-with-pretitle{background:transparent!important}html.skin-theme-clientpref-night .mw-parser-output .sidebar:not(.notheme) .sidebar-title-with-pretitle a{color:var(--color-progressive)!important}}@media screen and (prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .sidebar:not(.notheme) .sidebar-list-title,html.skin-theme-clientpref-os .mw-parser-output .sidebar:not(.notheme) .sidebar-title-with-pretitle{background:transparent!important}html.skin-theme-clientpref-os .mw-parser-output .sidebar:not(.notheme) .sidebar-title-with-pretitle a{color:var(--color-progressive)!important}}@media print{body.ns-0 .mw-parser-output .sidebar{display:none!important}}</style><style data-mw-deduplicate="TemplateStyles:r886047488">.mw-parser-output .nobold{font-weight:normal}</style><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r886047488"><table class="sidebar sidebar-collapse nomobile nowraplinks"><tbody><tr><td class="sidebar-pretitle">Part of a series on</td></tr><tr><th class="sidebar-title-with-pretitle"><a href="/wiki/Machine_learning" title="Machine learning">Machine learning</a><br />and <a href="/wiki/Data_mining" title="Data mining">data mining</a></th></tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Paradigms</div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Supervised_learning" title="Supervised learning">Supervised 
learning</a></li> <li><a href="/wiki/Unsupervised_learning" title="Unsupervised learning">Unsupervised learning</a></li> <li><a href="/wiki/Semi-supervised_learning" class="mw-redirect" title="Semi-supervised learning">Semi-supervised learning</a></li> <li><a href="/wiki/Self-supervised_learning" title="Self-supervised learning">Self-supervised learning</a></li> <li><a href="/wiki/Reinforcement_learning" title="Reinforcement learning">Reinforcement learning</a></li> <li><a href="/wiki/Meta-learning_(computer_science)" title="Meta-learning (computer science)">Meta-learning</a></li> <li><a href="/wiki/Online_machine_learning" title="Online machine learning">Online learning</a></li> <li><a href="/wiki/Batch_learning" class="mw-redirect" title="Batch learning">Batch learning</a></li> <li><a href="/wiki/Curriculum_learning" title="Curriculum learning">Curriculum learning</a></li> <li><a href="/wiki/Rule-based_machine_learning" title="Rule-based machine learning">Rule-based learning</a></li> <li><a href="/wiki/Neuro-symbolic_AI" title="Neuro-symbolic AI">Neuro-symbolic AI</a></li> <li><a href="/wiki/Neuromorphic_engineering" class="mw-redirect" title="Neuromorphic engineering">Neuromorphic engineering</a></li> <li><a href="/wiki/Quantum_machine_learning" title="Quantum machine learning">Quantum machine learning</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Problems</div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Statistical_classification" title="Statistical classification">Classification</a></li> <li><a href="/wiki/Generative_model" title="Generative model">Generative modeling</a></li> <li><a href="/wiki/Regression_analysis" title="Regression analysis">Regression</a></li> <li><a href="/wiki/Cluster_analysis" 
title="Cluster analysis">Clustering</a></li> <li><a href="/wiki/Dimensionality_reduction" title="Dimensionality reduction">Dimensionality reduction</a></li> <li><a href="/wiki/Density_estimation" title="Density estimation">Density estimation</a></li> <li><a href="/wiki/Anomaly_detection" title="Anomaly detection">Anomaly detection</a></li> <li><a href="/wiki/Data_cleaning" class="mw-redirect" title="Data cleaning">Data cleaning</a></li> <li><a href="/wiki/Automated_machine_learning" title="Automated machine learning">AutoML</a></li> <li><a href="/wiki/Association_rule_learning" title="Association rule learning">Association rules</a></li> <li><a href="/wiki/Semantic_analysis_(machine_learning)" title="Semantic analysis (machine learning)">Semantic analysis</a></li> <li><a href="/wiki/Structured_prediction" title="Structured prediction">Structured prediction</a></li> <li><a href="/wiki/Feature_engineering" title="Feature engineering">Feature engineering</a></li> <li><a href="/wiki/Feature_learning" title="Feature learning">Feature learning</a></li> <li><a href="/wiki/Learning_to_rank" title="Learning to rank">Learning to rank</a></li> <li><a href="/wiki/Grammar_induction" title="Grammar induction">Grammar induction</a></li> <li><a href="/wiki/Ontology_learning" title="Ontology learning">Ontology learning</a></li> <li><a href="/wiki/Multimodal_learning" title="Multimodal learning">Multimodal learning</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><div style="display: inline-block; line-height: 1.2em; padding: .1em 0;"><a href="/wiki/Supervised_learning" title="Supervised learning">Supervised learning</a><br /><span class="nobold"><span style="font-size:85%;">(<b><a href="/wiki/Statistical_classification" title="Statistical 
classification">classification</a></b> • <b><a href="/wiki/Regression_analysis" title="Regression analysis">regression</a></b>)</span></span> </div></div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Apprenticeship_learning" title="Apprenticeship learning">Apprenticeship learning</a></li> <li><a href="/wiki/Decision_tree_learning" title="Decision tree learning">Decision trees</a></li> <li><a href="/wiki/Ensemble_learning" title="Ensemble learning">Ensembles</a> <ul><li><a href="/wiki/Bootstrap_aggregating" title="Bootstrap aggregating">Bagging</a></li> <li><a href="/wiki/Boosting_(machine_learning)" title="Boosting (machine learning)">Boosting</a></li> <li><a href="/wiki/Random_forest" title="Random forest">Random forest</a></li></ul></li> <li><a href="/wiki/K-nearest_neighbors_algorithm" title="K-nearest neighbors algorithm"><i>k</i>-NN</a></li> <li><a href="/wiki/Linear_regression" title="Linear regression">Linear regression</a></li> <li><a href="/wiki/Naive_Bayes_classifier" title="Naive Bayes classifier">Naive Bayes</a></li> <li><a href="/wiki/Artificial_neural_network" class="mw-redirect" title="Artificial neural network">Artificial neural networks</a></li> <li><a href="/wiki/Logistic_regression" title="Logistic regression">Logistic regression</a></li> <li><a href="/wiki/Perceptron" title="Perceptron">Perceptron</a></li> <li><a href="/wiki/Relevance_vector_machine" title="Relevance vector machine">Relevance vector machine (RVM)</a></li> <li><a href="/wiki/Support_vector_machine" title="Support vector machine">Support vector machine (SVM)</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><a href="/wiki/Cluster_analysis" title="Cluster analysis">Clustering</a></div><div class="sidebar-list-content 
mw-collapsible-content hlist"> <ul><li><a href="/wiki/BIRCH" title="BIRCH">BIRCH</a></li> <li><a href="/wiki/CURE_algorithm" title="CURE algorithm">CURE</a></li> <li><a href="/wiki/Hierarchical_clustering" title="Hierarchical clustering">Hierarchical</a></li> <li><a href="/wiki/K-means_clustering" title="K-means clustering"><i>k</i>-means</a></li> <li><a href="/wiki/Fuzzy_clustering" title="Fuzzy clustering">Fuzzy</a></li> <li><a href="/wiki/Expectation%E2%80%93maximization_algorithm" title="Expectation–maximization algorithm">Expectation–maximization (EM)</a></li> <li><br /><a href="/wiki/DBSCAN" title="DBSCAN">DBSCAN</a></li> <li><a href="/wiki/OPTICS_algorithm" title="OPTICS algorithm">OPTICS</a></li> <li><a href="/wiki/Mean_shift" title="Mean shift">Mean shift</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><a href="/wiki/Dimensionality_reduction" title="Dimensionality reduction">Dimensionality reduction</a></div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Factor_analysis" title="Factor analysis">Factor analysis</a></li> <li><a href="/wiki/Canonical_correlation" title="Canonical correlation">CCA</a></li> <li><a href="/wiki/Independent_component_analysis" title="Independent component analysis">ICA</a></li> <li><a href="/wiki/Linear_discriminant_analysis" title="Linear discriminant analysis">LDA</a></li> <li><a href="/wiki/Non-negative_matrix_factorization" title="Non-negative matrix factorization">NMF</a></li> <li><a href="/wiki/Principal_component_analysis" title="Principal component analysis">PCA</a></li> <li><a href="/wiki/Proper_generalized_decomposition" title="Proper generalized decomposition">PGD</a></li> <li><a href="/wiki/T-distributed_stochastic_neighbor_embedding" title="T-distributed stochastic 
neighbor embedding">t-SNE</a></li> <li><a href="/wiki/Sparse_dictionary_learning" title="Sparse dictionary learning">SDL</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><a href="/wiki/Structured_prediction" title="Structured prediction">Structured prediction</a></div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Graphical_model" title="Graphical model">Graphical models</a> <ul><li><a href="/wiki/Bayesian_network" title="Bayesian network">Bayes net</a></li> <li><a href="/wiki/Conditional_random_field" title="Conditional random field">Conditional random field</a></li> <li><a href="/wiki/Hidden_Markov_model" title="Hidden Markov model">Hidden Markov</a></li></ul></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><a href="/wiki/Anomaly_detection" title="Anomaly detection">Anomaly detection</a></div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Random_sample_consensus" title="Random sample consensus">RANSAC</a></li> <li><a href="/wiki/K-nearest_neighbors_algorithm" title="K-nearest neighbors algorithm"><i>k</i>-NN</a></li> <li><a href="/wiki/Local_outlier_factor" title="Local outlier factor">Local outlier factor</a></li> <li><a href="/wiki/Isolation_forest" title="Isolation forest">Isolation forest</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><a 
href="/wiki/Artificial_neural_network" class="mw-redirect" title="Artificial neural network">Artificial neural network</a></div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Autoencoder" title="Autoencoder">Autoencoder</a></li> <li><a href="/wiki/Deep_learning" title="Deep learning">Deep learning</a></li> <li><a href="/wiki/Feedforward_neural_network" title="Feedforward neural network">Feedforward neural network</a></li> <li><a href="/wiki/Recurrent_neural_network" title="Recurrent neural network">Recurrent neural network</a> <ul><li><a href="/wiki/Long_short-term_memory" title="Long short-term memory">LSTM</a></li> <li><a href="/wiki/Gated_recurrent_unit" title="Gated recurrent unit">GRU</a></li> <li><a href="/wiki/Echo_state_network" title="Echo state network">ESN</a></li> <li><a href="/wiki/Reservoir_computing" title="Reservoir computing">reservoir computing</a></li></ul></li> <li><a href="/wiki/Boltzmann_machine" title="Boltzmann machine">Boltzmann machine</a> <ul><li><a href="/wiki/Restricted_Boltzmann_machine" title="Restricted Boltzmann machine">Restricted</a></li></ul></li> <li><a href="/wiki/Generative_adversarial_network" title="Generative adversarial network">GAN</a></li> <li><a href="/wiki/Diffusion_model" title="Diffusion model">Diffusion model</a></li> <li><a href="/wiki/Self-organizing_map" title="Self-organizing map">SOM</a></li> <li><a href="/wiki/Convolutional_neural_network" title="Convolutional neural network">Convolutional neural network</a> <ul><li><a href="/wiki/U-Net" title="U-Net">U-Net</a></li> <li><a href="/wiki/LeNet" title="LeNet">LeNet</a></li> <li><a href="/wiki/AlexNet" title="AlexNet">AlexNet</a></li> <li><a href="/wiki/DeepDream" title="DeepDream">DeepDream</a></li></ul></li> <li><a href="/wiki/Neural_radiance_field" title="Neural radiance field">Neural radiance field</a></li> <li><a href="/wiki/Transformer_(machine_learning_model)" class="mw-redirect" title="Transformer (machine learning 
model)">Transformer</a> <ul><li><a href="/wiki/Vision_transformer" title="Vision transformer">Vision</a></li></ul></li> <li><a href="/wiki/Mamba_(deep_learning_architecture)" title="Mamba (deep learning architecture)">Mamba</a></li> <li><a href="/wiki/Spiking_neural_network" title="Spiking neural network">Spiking neural network</a></li> <li><a href="/wiki/Memtransistor" title="Memtransistor">Memtransistor</a></li> <li><a href="/wiki/Electrochemical_RAM" title="Electrochemical RAM">Electrochemical RAM</a> (ECRAM)</li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><a href="/wiki/Reinforcement_learning" title="Reinforcement learning">Reinforcement learning</a></div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Q-learning" title="Q-learning">Q-learning</a></li> <li><a href="/wiki/State%E2%80%93action%E2%80%93reward%E2%80%93state%E2%80%93action" title="State–action–reward–state–action">SARSA</a></li> <li><a href="/wiki/Temporal_difference_learning" title="Temporal difference learning">Temporal difference (TD)</a></li> <li><a href="/wiki/Multi-agent_reinforcement_learning" title="Multi-agent reinforcement learning">Multi-agent</a> <ul><li><a href="/wiki/Self-play_(reinforcement_learning_technique)" class="mw-redirect" title="Self-play (reinforcement learning technique)">Self-play</a></li></ul></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Learning with humans</div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Active_learning_(machine_learning)" title="Active learning (machine 
learning)">Active learning</a></li> <li><a href="/wiki/Crowdsourcing" title="Crowdsourcing">Crowdsourcing</a></li> <li><a href="/wiki/Human-in-the-loop" title="Human-in-the-loop">Human-in-the-loop</a></li> <li><a href="/wiki/Reinforcement_learning_from_human_feedback" title="Reinforcement learning from human feedback">RLHF</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Model diagnostics</div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Coefficient_of_determination" title="Coefficient of determination">Coefficient of determination</a></li> <li><a href="/wiki/Confusion_matrix" title="Confusion matrix">Confusion matrix</a></li> <li><a href="/wiki/Learning_curve_(machine_learning)" title="Learning curve (machine learning)">Learning curve</a></li> <li><a href="/wiki/Receiver_operating_characteristic" title="Receiver operating characteristic">ROC curve</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Mathematical foundations</div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Kernel_machines" class="mw-redirect" title="Kernel machines">Kernel machines</a></li> <li><a href="/wiki/Bias%E2%80%93variance_tradeoff" title="Bias–variance tradeoff">Bias–variance tradeoff</a></li> <li><a href="/wiki/Computational_learning_theory" title="Computational learning theory">Computational learning theory</a></li> <li><a href="/wiki/Empirical_risk_minimization" title="Empirical risk minimization">Empirical risk minimization</a></li> <li><a href="/wiki/Occam_learning" title="Occam 
learning">Occam learning</a></li> <li><a href="/wiki/Probably_approximately_correct_learning" title="Probably approximately correct learning">PAC learning</a></li> <li><a href="/wiki/Statistical_learning_theory" title="Statistical learning theory">Statistical learning</a></li> <li><a href="/wiki/Vapnik%E2%80%93Chervonenkis_theory" title="Vapnik–Chervonenkis theory">VC theory</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Journals and conferences</div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/ECML_PKDD" title="ECML PKDD">ECML PKDD</a></li> <li><a href="/wiki/Conference_on_Neural_Information_Processing_Systems" title="Conference on Neural Information Processing Systems">NeurIPS</a></li> <li><a href="/wiki/International_Conference_on_Machine_Learning" title="International Conference on Machine Learning">ICML</a></li> <li><a href="/wiki/International_Conference_on_Learning_Representations" title="International Conference on Learning Representations">ICLR</a></li> <li><a href="/wiki/International_Joint_Conference_on_Artificial_Intelligence" title="International Joint Conference on Artificial Intelligence">IJCAI</a></li> <li><a href="/wiki/Machine_Learning_(journal)" title="Machine Learning (journal)">ML</a></li> <li><a href="/wiki/Journal_of_Machine_Learning_Research" title="Journal of Machine Learning Research">JMLR</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Related articles</div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a 
href="/wiki/Glossary_of_artificial_intelligence" title="Glossary of artificial intelligence">Glossary of artificial intelligence</a></li> <li><a href="/wiki/List_of_datasets_for_machine-learning_research" title="List of datasets for machine-learning research">List of datasets for machine-learning research</a> <ul><li><a href="/wiki/List_of_datasets_in_computer_vision_and_image_processing" title="List of datasets in computer vision and image processing">List of datasets in computer vision and image processing</a></li></ul></li> <li><a href="/wiki/Outline_of_machine_learning" title="Outline of machine learning">Outline of machine learning</a></li></ul></div></div></td> </tr><tr><td class="sidebar-navbar"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374"><style data-mw-deduplicate="TemplateStyles:r1239400231">.mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{float:left;text-align:left}.mw-parser-output .navbar-boxtext{word-spacing:0}.mw-parser-output .navbar ul{display:inline-block;white-space:nowrap;line-height:inherit}.mw-parser-output .navbar-brackets::before{margin-right:-0.125em;content:"[ "}.mw-parser-output .navbar-brackets::after{margin-left:-0.125em;content:" ]"}.mw-parser-output .navbar li{word-spacing:-0.125em}.mw-parser-output .navbar a>span,.mw-parser-output .navbar a>abbr{text-decoration:inherit}.mw-parser-output .navbar-mini abbr{font-variant:small-caps;border-bottom:none;text-decoration:none;cursor:inherit}.mw-parser-output .navbar-ct-full{font-size:114%;margin:0 7em}.mw-parser-output .navbar-ct-mini{font-size:114%;margin:0 4em}html.skin-theme-clientpref-night .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}@media(prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}}@media print{.mw-parser-output .navbar{display:none!important}}</style><div class="navbar plainlinks 
hlist navbar-mini"><ul><li class="nv-view"><a href="/wiki/Template:Machine_learning" title="Template:Machine learning"><abbr title="View this template">v</abbr></a></li><li class="nv-talk"><a href="/wiki/Template_talk:Machine_learning" title="Template talk:Machine learning"><abbr title="Discuss this template">t</abbr></a></li><li class="nv-edit"><a href="/wiki/Special:EditPage/Template:Machine_learning" title="Special:EditPage/Template:Machine learning"><abbr title="Edit this template">e</abbr></a></li></ul></div></td></tr></tbody></table> <p>A <b>large language model</b> (<b>LLM</b>) is a type of computational <a href="/wiki/Model#Conceptual_model" title="Model">model</a> designed for <a href="/wiki/Natural_language_processing" title="Natural language processing">natural language processing</a> tasks such as language <a href="/wiki/Generative_artificial_intelligence" title="Generative artificial intelligence">generation</a>. As <a href="/wiki/Language_model" title="Language model">language models</a>, LLMs acquire these abilities by <a href="/wiki/Machine_learning" title="Machine learning">learning statistical relationships</a> from vast amounts of text during a <a href="/wiki/Self-supervised_learning" title="Self-supervised learning">self-supervised</a> and <a href="/wiki/Semi-supervised_learning" class="mw-redirect" title="Semi-supervised learning">semi-supervised</a> training process.<sup id="cite_ref-:7_1-0" class="reference"><a href="#cite_note-:7-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup> </p><p>The largest and most capable LLMs are <a href="/wiki/Artificial_neural_network" class="mw-redirect" title="Artificial neural network">artificial neural networks</a> built with a decoder-only <a href="/wiki/Transformer_(deep_learning_architecture)" title="Transformer (deep learning architecture)">transformer-based architecture</a>, enabling efficient processing and generation of large-scale text data. 
Modern models can be <a href="/wiki/Fine-tuning_(deep_learning)" title="Fine-tuning (deep learning)">fine-tuned</a> for specific tasks, or be guided by <a href="/wiki/Prompt_engineering" title="Prompt engineering">prompt engineering</a>.<sup id="cite_ref-few-shot-learners_2-0" class="reference"><a href="#cite_note-few-shot-learners-2"><span class="cite-bracket">[</span>2<span class="cite-bracket">]</span></a></sup> These models acquire <a href="/wiki/Predictive_learning" title="Predictive learning">predictive power</a> regarding <a href="/wiki/Syntax" title="Syntax">syntax</a>, <a href="/wiki/Semantics" title="Semantics">semantics</a>, and <a href="/wiki/Ontology_(information_science)" title="Ontology (information science)">ontologies</a><sup id="cite_ref-3" class="reference"><a href="#cite_note-3"><span class="cite-bracket">[</span>3<span class="cite-bracket">]</span></a></sup> inherent in human language corpora, but they also inherit inaccuracies and <a href="/wiki/Algorithmic_bias" title="Algorithmic bias">biases</a> present in the <a href="/wiki/Training,_validation,_and_test_data_sets" title="Training, validation, and test data sets">data</a> on which they are trained.<sup id="cite_ref-Manning-2022_4-0" class="reference"><a href="#cite_note-Manning-2022-4"><span class="cite-bracket">[</span>4<span class="cite-bracket">]</span></a></sup> </p> <meta property="mw:PageProp/toc" /> <div class="mw-heading mw-heading2"><h2 id="History">History</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=1" title="Edit section: History"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <figure class="mw-default-size" typeof="mw:File/Thumb"><a href="/wiki/File:Trends_in_AI_training_FLOP_over_time_(2010-2025).svg" class="mw-file-description"><img 
src="//upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Trends_in_AI_training_FLOP_over_time_%282010-2025%29.svg/220px-Trends_in_AI_training_FLOP_over_time_%282010-2025%29.svg.png" decoding="async" width="220" height="199" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Trends_in_AI_training_FLOP_over_time_%282010-2025%29.svg/330px-Trends_in_AI_training_FLOP_over_time_%282010-2025%29.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Trends_in_AI_training_FLOP_over_time_%282010-2025%29.svg/440px-Trends_in_AI_training_FLOP_over_time_%282010-2025%29.svg.png 2x" data-file-width="801" data-file-height="724" /></a><figcaption>The training compute of notable large models in FLOPs vs publication date over the period 2010-2024. For overall notable models (top left), frontier models (top right), top language models (bottom left) and top models within leading companies (bottom right). The majority of these models are language models.</figcaption></figure> <figure class="mw-default-size" typeof="mw:File/Thumb"><a href="/wiki/File:Large-scale_AI_training_compute_(FLOP)_vs_Publication_date_(2017-2024).svg" class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/0/06/Large-scale_AI_training_compute_%28FLOP%29_vs_Publication_date_%282017-2024%29.svg/220px-Large-scale_AI_training_compute_%28FLOP%29_vs_Publication_date_%282017-2024%29.svg.png" decoding="async" width="220" height="124" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/0/06/Large-scale_AI_training_compute_%28FLOP%29_vs_Publication_date_%282017-2024%29.svg/330px-Large-scale_AI_training_compute_%28FLOP%29_vs_Publication_date_%282017-2024%29.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/0/06/Large-scale_AI_training_compute_%28FLOP%29_vs_Publication_date_%282017-2024%29.svg/440px-Large-scale_AI_training_compute_%28FLOP%29_vs_Publication_date_%282017-2024%29.svg.png 2x" data-file-width="1920" 
data-file-height="1080" /></a><figcaption>The training compute of notable large AI models in FLOPs vs publication date over the period 2017-2024. The majority of large models are language models or multimodal models with language capacity.</figcaption></figure> <p>Before 2017, there were a few language models that were large as compared to capacities then available. In the 1990s, the <a href="/wiki/IBM_alignment_models" title="IBM alignment models">IBM alignment models</a> pioneered statistical language modelling. A smoothed n-gram model in 2001 trained on 0.3 billion words achieved state-of-the-art <a href="/wiki/Perplexity" title="Perplexity">perplexity</a> at the time.<sup id="cite_ref-5" class="reference"><a href="#cite_note-5"><span class="cite-bracket">[</span>5<span class="cite-bracket">]</span></a></sup> In the 2000s, as Internet use became prevalent, some researchers constructed Internet-scale language datasets ("web as corpus"<sup id="cite_ref-6" class="reference"><a href="#cite_note-6"><span class="cite-bracket">[</span>6<span class="cite-bracket">]</span></a></sup>), upon which they trained statistical language models.<sup id="cite_ref-7" class="reference"><a href="#cite_note-7"><span class="cite-bracket">[</span>7<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-8" class="reference"><a href="#cite_note-8"><span class="cite-bracket">[</span>8<span class="cite-bracket">]</span></a></sup> In 2009, in most language processing tasks, statistical language models dominated over symbolic language models, as they can usefully ingest large datasets.<sup id="cite_ref-9" class="reference"><a href="#cite_note-9"><span class="cite-bracket">[</span>9<span class="cite-bracket">]</span></a></sup> </p><p> After neural networks became dominant in image processing around 2012,<sup id="cite_ref-10" class="reference"><a href="#cite_note-10"><span class="cite-bracket">[</span>10<span class="cite-bracket">]</span></a></sup> they were applied to language modelling 
as well. Google converted its translation service to <a href="/wiki/Google_Neural_Machine_Translation" title="Google Neural Machine Translation">Neural Machine Translation</a> in 2016. As it was before <a href="/wiki/Transformer_model" class="mw-redirect" title="Transformer model">transformers</a>, it was done by <a href="/wiki/Seq2seq" title="Seq2seq">seq2seq</a> deep <a href="/wiki/LSTM" class="mw-redirect" title="LSTM">LSTM</a> networks.</p><figure class="mw-default-size" typeof="mw:File/Thumb"><a href="/wiki/File:The-Transformer-model-architecture.png" class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/8/8f/The-Transformer-model-architecture.png/290px-The-Transformer-model-architecture.png" decoding="async" width="290" height="261" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/8/8f/The-Transformer-model-architecture.png/435px-The-Transformer-model-architecture.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/8/8f/The-Transformer-model-architecture.png/580px-The-Transformer-model-architecture.png 2x" data-file-width="850" data-file-height="765" /></a><figcaption>An illustration of main components of the transformer model from the original paper, where layers were normalized after (instead of before) multiheaded attention</figcaption></figure> <p>At the 2017 <a href="/wiki/NeurIPS" class="mw-redirect" title="NeurIPS">NeurIPS</a> conference, Google researchers introduced the transformer architecture in their landmark paper "<a href="/wiki/Attention_Is_All_You_Need" title="Attention Is All You Need">Attention Is All You Need</a>". 
This paper's goal was to improve upon 2014 seq2seq technology,<sup id="cite_ref-11" class="reference"><a href="#cite_note-11"><span class="cite-bracket">[</span>11<span class="cite-bracket">]</span></a></sup> and was based mainly on the <a href="/wiki/Attention_(machine_learning)" title="Attention (machine learning)">attention</a> mechanism developed by Bahdanau et al. in 2014.<sup id="cite_ref-12" class="reference"><a href="#cite_note-12"><span class="cite-bracket">[</span>12<span class="cite-bracket">]</span></a></sup> The following year in 2018, <a href="/wiki/BERT_(language_model)" title="BERT (language model)">BERT</a> was introduced and quickly became "ubiquitous".<sup id="cite_ref-13" class="reference"><a href="#cite_note-13"><span class="cite-bracket">[</span>13<span class="cite-bracket">]</span></a></sup> Though the original transformer has both encoder and decoder blocks, BERT is an encoder-only model. </p><p>Although decoder-only <a href="/wiki/GPT-1" title="GPT-1">GPT-1</a> was introduced in 2018, it was <a href="/wiki/GPT-2" title="GPT-2">GPT-2</a> in 2019 that caught widespread attention because <a href="/wiki/OpenAI" title="OpenAI">OpenAI</a> at first deemed it too powerful to release publicly, out of fear of malicious use.<sup id="cite_ref-14" class="reference"><a href="#cite_note-14"><span class="cite-bracket">[</span>14<span class="cite-bracket">]</span></a></sup> <a href="/wiki/GPT-3" title="GPT-3">GPT-3</a> in 2020 went a step further and as of 2024<sup class="plainlinks noexcerpt noprint asof-tag update" style="display:none;"><a class="external text" href="https://en.wikipedia.org/w/index.php?title=Large_language_model&action=edit">[update]</a></sup> is available only via <a href="/wiki/Web_API" title="Web API">API</a> with no offering of downloading the model to execute locally. 
But it was the 2022 consumer-facing browser-based <a href="/wiki/ChatGPT" title="ChatGPT">ChatGPT</a> that captured the imaginations of the general population and caused some media hype and online buzz.<sup id="cite_ref-15" class="reference"><a href="#cite_note-15"><span class="cite-bracket">[</span>15<span class="cite-bracket">]</span></a></sup> The 2023 <a href="/wiki/GPT-4" title="GPT-4">GPT-4</a> was praised for its increased accuracy and as a "holy grail" for its <a href="/wiki/Multimodal_learning" title="Multimodal learning">multimodal</a> capabilities.<sup id="cite_ref-16" class="reference"><a href="#cite_note-16"><span class="cite-bracket">[</span>16<span class="cite-bracket">]</span></a></sup> OpenAI did not reveal the high-level architecture and the number of <a href="/wiki/Parameter#Artificial_Intelligence" title="Parameter">parameters</a> of GPT-4. </p><p>Competing language models have for the most part been attempting to equal the GPT series, at least in terms of number of parameters.<sup id="cite_ref-17" class="reference"><a href="#cite_note-17"><span class="cite-bracket">[</span>17<span class="cite-bracket">]</span></a></sup> </p><p>Since 2022, <a href="/wiki/Source-available_software" title="Source-available software">source-available</a> models have been gaining popularity, especially at first with <a href="/wiki/BLOOM_(language_model)" title="BLOOM (language model)">BLOOM</a> and <a href="/wiki/LLaMA" class="mw-redirect" title="LLaMA">LLaMA</a>, though both have restrictions on the field of use. <a href="/wiki/Mistral_AI" title="Mistral AI">Mistral AI</a>'s models Mistral 7B and Mixtral 8x7b have the more permissive <a href="/wiki/Apache_License" title="Apache License">Apache License</a>. 
As of June 2024<sup class="plainlinks noexcerpt noprint asof-tag update" style="display:none;"><a class="external text" href="https://en.wikipedia.org/w/index.php?title=Large_language_model&action=edit">[update]</a></sup>, the instruction fine-tuned variant of the Llama 3 70-billion-parameter model is the most powerful open LLM according to the LMSYS Chatbot Arena Leaderboard, being more powerful than GPT-3.5 but not as powerful as GPT-4.<sup id="cite_ref-18" class="reference"><a href="#cite_note-18"><span class="cite-bracket">[</span>18<span class="cite-bracket">]</span></a></sup> </p><p>As of 2024, the largest and most capable models are all based on the Transformer architecture. Some recent implementations are based on other architectures, such as <a href="/wiki/Recurrent_neural_network" title="Recurrent neural network">recurrent neural network</a> variants and <a href="/wiki/Mamba_(deep_learning_architecture)" title="Mamba (deep learning architecture)">Mamba</a> (a <a href="/wiki/State-space_representation" title="State-space representation">state space</a> model).<sup id="cite_ref-19" class="reference"><a href="#cite_note-19"><span class="cite-bracket">[</span>19<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-20" class="reference"><a href="#cite_note-20"><span class="cite-bracket">[</span>20<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-21" class="reference"><a href="#cite_note-21"><span class="cite-bracket">[</span>21<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading2"><h2 id="Dataset_preprocessing">Dataset preprocessing</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=2" title="Edit section: Dataset preprocessing"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" 
class="hatnote navigation-not-searchable">See also: <a href="/wiki/List_of_datasets_for_machine-learning_research#Internet" title="List of datasets for machine-learning research">List of datasets for machine-learning research § Internet</a></div> <div class="mw-heading mw-heading3"><h3 id="Tokenization">Tokenization</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=3" title="Edit section: Tokenization"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p><span class="anchor" id="Tokenization"></span> </p><p>Because <a href="/wiki/Machine_learning" title="Machine learning">machine learning</a> algorithms process numbers rather than text, the text must be converted to numbers. In the first step, a vocabulary is decided upon, then integer indices are arbitrarily but uniquely assigned to each vocabulary entry, and finally, an <a href="/wiki/Word_embedding" title="Word embedding">embedding</a> is associated to the integer index. Algorithms include <a href="/wiki/Byte_pair_encoding" title="Byte pair encoding">byte-pair encoding</a> (BPE) and <a href="/wiki/BERT_(language_model)#Design" title="BERT (language model)">WordPiece</a>. There are also special tokens serving as <a href="/wiki/Control_character" title="Control character">control characters</a>, such as <code>[MASK]</code> for masked-out token (as used in <a href="/wiki/BERT_(language_model)" title="BERT (language model)">BERT</a>), and <code>[UNK]</code> ("unknown") for characters not appearing in the vocabulary. Also, some special symbols are used to denote special text formatting. For example, "Ġ" denotes a preceding whitespace in RoBERTa and GPT. 
"##" denotes continuation of a preceding word in BERT.<sup id="cite_ref-22" class="reference"><a href="#cite_note-22"><span class="cite-bracket">[</span>22<span class="cite-bracket">]</span></a></sup> </p><p>For example, the BPE tokenizer used by GPT-3 (Legacy) would split <small><code>tokenizer: texts -> series of numerical "tokens"</code></small> as </p> <table cellpadding="0;" cellspacing="0;" style="border:1px solid black"> <tbody><tr> <td style="border-left: 2px green; border-right: 2px green">token </td> <td style="background-color: grey; color: white; border-left: 2px green; border-right: 2px green">izer </td> <td style="border-left: 2px green; border-right: 2px green">: </td> <td style="background-color: grey; color: white; border-left: 2px green; border-right: 2px green"> texts </td> <td style="border-left: 2px green; border-right: 2px green"> -> </td> <td style="background-color: grey; color: white; border-left: 2px green; border-right: 2px green">series </td> <td style="border-left: 2px green; border-right: 2px green"> of </td> <td style="background-color: grey; color: white; border-left: 2px green; border-right: 2px green"> numerical </td> <td style="border-left: 2px green; border-right: 2px green"> " </td> <td style="background-color: grey; color: white; border-left: 2px green; border-right: 2px green">t </td> <td style="border-left: 2px green; border-right: 2px green">ok </td> <td style="background-color: grey; color: white; border-left: 2px green; border-right: 2px green">ens </td> <td style="border-left: 2px green; border-right: 2px green">" </td></tr></tbody></table> <p>Tokenization also <a href="/wiki/Data_compression" title="Data compression">compresses</a> the datasets. 
Because LLMs generally require input to be an <a href="/wiki/Array_(data_structure)" title="Array (data structure)">array</a> that is not <a href="/wiki/Jagged_array" title="Jagged array">jagged</a>, the shorter texts must be "padded" until they match the length of the longest one. How many tokens are, on average, needed per word depends on the language of the dataset.<sup id="cite_ref-23" class="reference"><a href="#cite_note-23"><span class="cite-bracket">[</span>23<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-24" class="reference"><a href="#cite_note-24"><span class="cite-bracket">[</span>24<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading4"><h4 id="BPE">BPE</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=4" title="Edit section: BPE"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">Main article: <a href="/wiki/Byte_pair_encoding" title="Byte pair encoding">Byte pair encoding</a></div> <p>As an example, consider a tokenizer based on byte-pair encoding. In the first step, all unique characters (including blanks and <a href="/wiki/Punctuation_mark" class="mw-redirect" title="Punctuation mark">punctuation marks</a>) are treated as an initial set of <a href="/wiki/N-gram" title="N-gram"><i>n</i>-grams</a> (i.e. initial set of uni-grams). Successively the most frequent pair of adjacent characters is merged into a bi-gram and all instances of the pair are replaced by it. 
All occurrences of adjacent pairs of (previously merged) <i>n</i>-grams that most frequently occur together are then again merged into even lengthier <i>n</i>-grams, until a vocabulary of prescribed size is obtained (in the case of <a href="/wiki/GPT-3" title="GPT-3">GPT-3</a>, the size is 50257).<sup id="cite_ref-xbiWb_25-0" class="reference"><a href="#cite_note-xbiWb-25"><span class="cite-bracket">[</span>25<span class="cite-bracket">]</span></a></sup> After a tokenizer is trained, any text can be tokenized by it, as long as it does not contain characters not appearing in the initial set of uni-grams.<sup id="cite_ref-2022Book_26-0" class="reference"><a href="#cite_note-2022Book_-26"><span class="cite-bracket">[</span>26<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading4"><h4 id="Problems">Problems</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=5" title="Edit section: Problems"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>A token vocabulary based on the frequencies extracted from mainly English corpora uses as few tokens as possible for an average English word. An average word in another language encoded by such an English-optimized tokenizer is, however, split into a suboptimal number of tokens. The GPT-2 tokenizer can use up to 15 times more tokens per word for some languages, for example for the <a href="/wiki/Shan_language" title="Shan language">Shan language</a> from <a href="/wiki/Myanmar" title="Myanmar">Myanmar</a>. 
Even more widespread languages such as Portuguese and German have "a premium of 50%" compared to English.<sup id="cite_ref-27" class="reference"><a href="#cite_note-27"><span class="cite-bracket">[</span>27<span class="cite-bracket">]</span></a></sup> </p><p>Greedy tokenization also causes subtle problems with text completion.<sup id="cite_ref-28" class="reference"><a href="#cite_note-28"><span class="cite-bracket">[</span>28<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Dataset_cleaning">Dataset cleaning</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=6" title="Edit section: Dataset cleaning"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">Main article: <a href="/wiki/Data_cleansing" title="Data cleansing">Data cleansing</a></div> <p>In the context of training LLMs, datasets are typically cleaned by removing toxic passages from the dataset, discarding low-quality data, and de-duplication.<sup id="cite_ref-aYNg4_29-0" class="reference"><a href="#cite_note-aYNg4-29"><span class="cite-bracket">[</span>29<span class="cite-bracket">]</span></a></sup> Cleaned datasets can increase training efficiency and lead to improved downstream performance.<sup id="cite_ref-30" class="reference"><a href="#cite_note-30"><span class="cite-bracket">[</span>30<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-31" class="reference"><a href="#cite_note-31"><span class="cite-bracket">[</span>31<span class="cite-bracket">]</span></a></sup> A trained LLM can be used to clean datasets for training a further LLM.<sup id="cite_ref-32" class="reference"><a href="#cite_note-32"><span class="cite-bracket">[</span>32<span class="cite-bracket">]</span></a></sup> 
</p><p>With the increasing proportion of LLM-generated content on the web, data cleaning in the future may include filtering out such content. LLM-generated content can pose a problem if the content is similar to human text (making filtering difficult) but of lower quality (degrading performance of models trained on it).<sup id="cite_ref-qbFw1_33-0" class="reference"><a href="#cite_note-qbFw1-33"><span class="cite-bracket">[</span>33<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Synthetic_data">Synthetic data</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=7" title="Edit section: Synthetic data"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">Main article: <a href="/wiki/Synthetic_data" title="Synthetic data">Synthetic data</a></div> <p>Training of the largest language models might need more linguistic data than is naturally available, or the naturally occurring data might be of insufficient quality. In these cases, synthetic data might be used. 
Microsoft's <a href="/w/index.php?title=Phi_(LLM)&action=edit&redlink=1" class="new" title="Phi (LLM) (page does not exist)">Phi</a> series of LLMs is trained on textbook-like data generated by another LLM.<sup id="cite_ref-34" class="reference"><a href="#cite_note-34"><span class="cite-bracket">[</span>34<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading2"><h2 id="Training_and_architecture">Training and architecture</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=8" title="Edit section: Training and architecture"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">See also: <a href="/wiki/Fine-tuning_(machine_learning)" class="mw-redirect" title="Fine-tuning (machine learning)">Fine-tuning (machine learning)</a></div> <div class="mw-heading mw-heading3"><h3 id="Reinforcement_learning_from_human_feedback_(RLHF)"><span id="Reinforcement_learning_from_human_feedback_.28RLHF.29"></span>Reinforcement learning from human feedback (RLHF)</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=9" title="Edit section: Reinforcement learning from human feedback (RLHF)"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">Main article: <a href="/wiki/Reinforcement_learning_from_human_feedback" title="Reinforcement learning from human feedback">Reinforcement learning from human feedback</a></div> <p>Reinforcement learning from human feedback (RLHF) through algorithms, such as <a href="/wiki/Proximal_Policy_Optimization" 
class="mw-redirect" title="Proximal Policy Optimization">proximal policy optimization</a>, is used to further fine-tune a model based on a dataset of human preferences.<sup id="cite_ref-instructGPT-paper_35-0" class="reference"><a href="#cite_note-instructGPT-paper-35"><span class="cite-bracket">[</span>35<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Instruction_tuning">Instruction tuning</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=10" title="Edit section: Instruction tuning"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Using "self-instruct" approaches, LLMs have been able to <a href="/wiki/Bootstrapping" title="Bootstrapping">bootstrap</a> correct responses, replacing any naive responses, starting from human-generated corrections of a few cases. For example, in the instruction "Write an essay about the main themes represented in <i>Hamlet</i>," an initial naive completion might be "If you submit the essay after March 17, your grade will be reduced by 10% for each day of delay," based on the frequency of this textual sequence in the corpus.<sup id="cite_ref-self-instruct-paper_36-0" class="reference"><a href="#cite_note-self-instruct-paper-36"><span class="cite-bracket">[</span>36<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Mixture_of_experts">Mixture of experts</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=11" title="Edit section: Mixture of experts"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">Main article: <a href="/wiki/Mixture_of_experts" 
title="Mixture of experts">Mixture of experts</a></div> <p>The largest LLM may be too expensive to train and use directly. For such models, <a href="/wiki/Mixture_of_experts" title="Mixture of experts">mixture of experts</a> (MoE) can be applied, a line of research pursued by Google researchers since 2017 to train models reaching up to 1 trillion parameters.<sup id="cite_ref-HGZCJ_37-0" class="reference"><a href="#cite_note-HGZCJ-37"><span class="cite-bracket">[</span>37<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-R9Qq5_38-0" class="reference"><a href="#cite_note-R9Qq5-38"><span class="cite-bracket">[</span>38<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-glam-blog_39-0" class="reference"><a href="#cite_note-glam-blog-39"><span class="cite-bracket">[</span>39<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Prompt_engineering,_attention_mechanism,_and_context_window"><span id="Prompt_engineering.2C_attention_mechanism.2C_and_context_window"></span>Prompt engineering, attention mechanism, and context window</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=12" title="Edit section: Prompt engineering, attention mechanism, and context window"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">See also: <a href="/wiki/Prompt_engineering" title="Prompt engineering">Prompt engineering</a> and <a href="/wiki/Attention_(machine_learning)" title="Attention (machine learning)">Attention (machine learning)</a></div> <p>Most results previously achievable only by (costly) fine-tuning, can be achieved through <a href="/wiki/Prompt_engineering" title="Prompt engineering">prompt engineering</a>, although limited to the scope of a single 
conversation (more precisely, limited to the scope of a context window).<sup id="cite_ref-emergentpaper_40-0" class="reference"><a href="#cite_note-emergentpaper-40"><span class="cite-bracket">[</span>40<span class="cite-bracket">]</span></a></sup> </p> <figure class="mw-default-size" typeof="mw:File/Thumb"><a href="/wiki/File:Multiple_attention_heads.png" class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/e/e9/Multiple_attention_heads.png/290px-Multiple_attention_heads.png" decoding="async" width="290" height="427" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/e/e9/Multiple_attention_heads.png/435px-Multiple_attention_heads.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/e/e9/Multiple_attention_heads.png/580px-Multiple_attention_heads.png 2x" data-file-width="870" data-file-height="1280" /></a><figcaption> When each head calculates, according to its own criteria, how much other tokens are relevant for the "it_" token, note that the second attention head, represented by the second column, is focusing most on the first two rows, i.e. the tokens "The" and "animal", while the third column is focusing most on the bottom two rows, i.e. on "tired", which has been tokenized into two tokens.<sup id="cite_ref-Jay_Allamar_41-0" class="reference"><a href="#cite_note-Jay_Allamar-41"><span class="cite-bracket">[</span>41<span class="cite-bracket">]</span></a></sup></figcaption></figure> <p>In order to find out which tokens are relevant to each other within the scope of the context window, the attention mechanism calculates "soft" weights for each token, more precisely for its embedding, by using multiple attention heads, each with its own "relevance" for calculating its own soft weights. For example, the small (i.e. 
117M parameter sized) <a href="/wiki/GPT-2" title="GPT-2">GPT-2</a> model has twelve attention heads and a context window of only 1k tokens.<sup id="cite_ref-Jay_Allamar_GPT2_42-0" class="reference"><a href="#cite_note-Jay_Allamar_GPT2-42"><span class="cite-bracket">[</span>42<span class="cite-bracket">]</span></a></sup> In its medium version it has 345M parameters and contains 24 layers, each with 12 attention heads. For the training with gradient descent a batch size of 512 was utilized.<sup id="cite_ref-2022Book_26-1" class="reference"><a href="#cite_note-2022Book_-26"><span class="cite-bracket">[</span>26<span class="cite-bracket">]</span></a></sup> </p><p>The largest models, such as Google's <a href="/wiki/Gemini_(language_model)" title="Gemini (language model)">Gemini 1.5</a>, presented in February 2024, can have a context window sized up to 1 million (a context window of 10 million was also "successfully tested").<sup id="cite_ref-43" class="reference"><a href="#cite_note-43"><span class="cite-bracket">[</span>43<span class="cite-bracket">]</span></a></sup> Other models with large context windows include Anthropic's Claude 2.1, with a context window of up to 200k tokens.<sup id="cite_ref-44" class="reference"><a href="#cite_note-44"><span class="cite-bracket">[</span>44<span class="cite-bracket">]</span></a></sup> Note that this maximum refers to the number of input tokens and that the maximum number of output tokens differs from the input and is often smaller. For example, the GPT-4 Turbo model has a maximum output of 4096 tokens.<sup id="cite_ref-45" class="reference"><a href="#cite_note-45"><span class="cite-bracket">[</span>45<span class="cite-bracket">]</span></a></sup> </p><p>The length of a conversation that the model can take into account when generating its next answer is also limited by the size of a context window. 
If the length of a conversation, for example with <a href="/wiki/ChatGPT" title="ChatGPT">ChatGPT</a>, is longer than its context window, only the parts inside the context window are taken into account when generating the next answer, or the model needs to apply some algorithm to summarize the too-distant parts of the conversation. </p><p>The shortcomings of making a context window larger include higher computational cost and possibly diluting the focus on local context, while making it smaller can cause a model to miss an important long-range dependency. Balancing them is a matter of experimentation and domain-specific considerations. </p><p>A model may be pre-trained either to predict how the segment continues, or what is missing in the segment, given a segment from its training dataset.<sup id="cite_ref-ioUpE_46-0" class="reference"><a href="#cite_note-ioUpE-46"><span class="cite-bracket">[</span>46<span class="cite-bracket">]</span></a></sup> It can be either </p> <ul><li>autoregressive (i.e. predicting how the segment continues, the way <a href="/wiki/Generative_pretrained_transformer" class="mw-redirect" title="Generative pretrained transformer">GPTs</a> do it): for example given a segment "I like to eat", the model predicts "ice cream", or "sushi".</li> <li>"<a href="/wiki/Cloze_test" title="Cloze test">masked</a>" (i.e. 
filling in the parts missing from the segment, the way "BERT"<sup id="cite_ref-jm_47-0" class="reference"><a href="#cite_note-jm-47"><span class="cite-bracket">[</span>47<span class="cite-bracket">]</span></a></sup> does it): for example, given a segment "I like to <code>[__] [__]</code> cream", the model predicts that "eat" and "ice" are missing.</li></ul> <p>Models may be trained on auxiliary tasks which test their understanding of the data distribution, such as Next Sentence Prediction (NSP), in which pairs of sentences are presented and the model must predict whether they appear consecutively in the training corpus.<sup id="cite_ref-jm_47-1" class="reference"><a href="#cite_note-jm-47"><span class="cite-bracket">[</span>47<span class="cite-bracket">]</span></a></sup> During training, <a href="/wiki/Regularization_(mathematics)" title="Regularization (mathematics)">regularization</a> loss is also used to stabilize training. However regularization loss is usually not used during <a href="/wiki/Training,_validation,_and_test_data_sets" title="Training, validation, and test data sets">testing</a> and evaluation. 
</p> <div class="mw-heading mw-heading3"><h3 id="Infrastructure">Infrastructure</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=13" title="Edit section: Infrastructure"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Substantial infrastructure is necessary for training the largest models.<sup id="cite_ref-48" class="reference"><a href="#cite_note-48"><span class="cite-bracket">[</span>48<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-49" class="reference"><a href="#cite_note-49"><span class="cite-bracket">[</span>49<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-50" class="reference"><a href="#cite_note-50"><span class="cite-bracket">[</span>50<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading2"><h2 id="Training_cost">Training cost</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=14" title="Edit section: Training cost"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <figure class="mw-default-size mw-halign-right" typeof="mw:File/Frameless"><a href="/wiki/File:Estimated_training_cost_of_some_AI_models_-_2024_AI_index.jpg" class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/6/64/Estimated_training_cost_of_some_AI_models_-_2024_AI_index.jpg/330px-Estimated_training_cost_of_some_AI_models_-_2024_AI_index.jpg" decoding="async" width="330" height="175" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/6/64/Estimated_training_cost_of_some_AI_models_-_2024_AI_index.jpg/495px-Estimated_training_cost_of_some_AI_models_-_2024_AI_index.jpg 1.5x, 
//upload.wikimedia.org/wikipedia/commons/thumb/6/64/Estimated_training_cost_of_some_AI_models_-_2024_AI_index.jpg/660px-Estimated_training_cost_of_some_AI_models_-_2024_AI_index.jpg 2x" data-file-width="2560" data-file-height="1361" /></a><figcaption></figcaption></figure> <p>Advances in software and hardware have reduced the cost substantially since 2020, such that in 2023 the computational cost of training a 12-billion-parameter LLM was 72,300 <a href="/wiki/Ampere_(microarchitecture)" title="Ampere (microarchitecture)">A100-GPU</a>-hours, while in 2020 the cost of training a 1.5-billion-parameter LLM (which was two orders of magnitude smaller than the state of the art in 2020) was between $80,000 and $1,600,000.<sup id="cite_ref-Wiggers_51-0" class="reference"><a href="#cite_note-Wiggers-51"><span class="cite-bracket">[</span>51<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-xaytj_52-0" class="reference"><a href="#cite_note-xaytj-52"><span class="cite-bracket">[</span>52<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-Pythia_53-0" class="reference"><a href="#cite_note-Pythia-53"><span class="cite-bracket">[</span>53<span class="cite-bracket">]</span></a></sup> Since 2020, large sums were invested in increasingly large models. For example, training of GPT-2 (i.e. a 1.5-billion-parameter model) in 2019 cost $50,000, while training of PaLM (i.e. a 540-billion-parameter model) in 2022 cost $8 million, and Megatron-Turing NLG 530B (in 2021) cost around $11 million.<sup id="cite_ref-54" class="reference"><a href="#cite_note-54"><span class="cite-bracket">[</span>54<span class="cite-bracket">]</span></a></sup> </p><p>For Transformer-based LLMs, training cost is much higher than inference cost. 
It costs 6 <a href="/wiki/FLOPS" class="mw-redirect" title="FLOPS">FLOPs</a> per parameter to train on one token, whereas it costs 1 to 2 FLOPs per parameter to infer on one token.<sup id="cite_ref-kaplan-scaling_55-0" class="reference"><a href="#cite_note-kaplan-scaling-55"><span class="cite-bracket">[</span>55<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading2"><h2 id="Tool_use">Tool use</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=15" title="Edit section: Tool use"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>There are certain tasks that, in principle, cannot be solved by any LLM, at least not without the use of external tools or additional software. An example of such a task is responding to the user's input '354 * 139 = ', provided that the LLM has not already encountered a continuation of this calculation in its training corpus.<sup class="noprint Inline-Template" style="white-space:nowrap;">[<i><a href="/wiki/Wikipedia:Accuracy_dispute#Disputed_statement" title="Wikipedia:Accuracy dispute"><span title="The material near this tag is possibly inaccurate or nonfactual. (September 2024)">dubious</span></a> – <a href="/wiki/Talk:Large_language_model#Dubious" title="Talk:Large language model">discuss</a></i>]</sup> In such cases, the LLM needs to resort to running program code that calculates the result, which can then be included in its response.<sup class="noprint Inline-Template" style="white-space:nowrap;">[<i><a href="/wiki/Wikipedia:Accuracy_dispute#Disputed_statement" title="Wikipedia:Accuracy dispute"><span title="The material near this tag is possibly inaccurate or nonfactual. (September 2024)">dubious</span></a> – <a href="/wiki/Talk:Large_language_model#Dubious" title="Talk:Large language model">discuss</a></i>]</sup> Another example is "What is the time now? 
It is ", where a separate program interpreter would need to execute code to get the system time on the computer, so that the LLM can include it in its reply.<sup id="cite_ref-PI1fW_56-0" class="reference"><a href="#cite_note-PI1fW-56"><span class="cite-bracket">[</span>56<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-J5OW5_57-0" class="reference"><a href="#cite_note-J5OW5-57"><span class="cite-bracket">[</span>57<span class="cite-bracket">]</span></a></sup> This basic strategy can be made more sophisticated with multiple attempts of generated programs, and other sampling strategies.<sup id="cite_ref-gQxzq_58-0" class="reference"><a href="#cite_note-gQxzq-58"><span class="cite-bracket">[</span>58<span class="cite-bracket">]</span></a></sup> </p><p>Generally, in order to get an LLM to use tools, one must fine-tune it for tool-use. If the number of tools is finite, then fine-tuning may be done just once. If the number of tools can grow arbitrarily, as with online <a href="/wiki/API" title="API">API</a> services, then the LLM can be fine-tuned to be able to read API documentation and call APIs correctly.<sup id="cite_ref-lLrda_59-0" class="reference"><a href="#cite_note-lLrda-59"><span class="cite-bracket">[</span>59<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-4Xzrs_60-0" class="reference"><a href="#cite_note-4Xzrs-60"><span class="cite-bracket">[</span>60<span class="cite-bracket">]</span></a></sup> </p><p>A simpler form of tool use is <a href="/wiki/Retrieval-augmented_generation" title="Retrieval-augmented generation">retrieval-augmented generation</a>: the augmentation of an LLM with <a href="/wiki/Document_retrieval" title="Document retrieval">document retrieval</a>. Given a query, a document retriever is called to retrieve the most relevant documents. 
This is usually done by encoding the query and the documents into vectors, then finding the documents with vectors (usually stored in a <a href="/wiki/Vector_database" title="Vector database">vector database</a>) most similar to the vector of the query. The LLM then generates an output based on both the query and context included from the retrieved documents.<sup id="cite_ref-BUZBP_61-0" class="reference"><a href="#cite_note-BUZBP-61"><span class="cite-bracket">[</span>61<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading2"><h2 id="Agency">Agency</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=16" title="Edit section: Agency"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>An LLM is typically not an <a href="/wiki/Autonomous_agent" title="Autonomous agent">autonomous agent</a> by itself, as it lacks the ability to interact with dynamic environments, recall past behaviors, and plan future actions, but can be transformed into one by integrating modules like profiling, memory, planning, and action.<sup id="cite_ref-62" class="reference"><a href="#cite_note-62"><span class="cite-bracket">[</span>62<span class="cite-bracket">]</span></a></sup> </p><p>The <a href="/w/index.php?title=ReAct_pattern&action=edit&redlink=1" class="new" title="ReAct pattern (page does not exist)">ReAct pattern</a>, a portmanteau of "Reason + Act", constructs an <a href="/wiki/Intelligent_agent" title="Intelligent agent">agent</a> out of an LLM, using the LLM as a planner. The LLM is prompted to "think out loud". Specifically, the language model is prompted with a textual description of the environment, a goal, a list of possible actions, and a record of the actions and observations so far. 
It generates one or more thoughts before generating an action, which is then executed in the environment.<sup id="cite_ref-DmvNE_63-0" class="reference"><a href="#cite_note-DmvNE-63"><span class="cite-bracket">[</span>63<span class="cite-bracket">]</span></a></sup> The linguistic description of the environment given to the LLM planner can even be the LaTeX code of a paper describing the environment.<sup id="cite_ref-JS8Vd_64-0" class="reference"><a href="#cite_note-JS8Vd-64"><span class="cite-bracket">[</span>64<span class="cite-bracket">]</span></a></sup> </p><p>In the DEPS ("Describe, Explain, Plan and Select") method, an LLM is first connected to the visual world via image descriptions, then it is prompted to produce plans for complex tasks and behaviors based on its pretrained knowledge and environmental feedback it receives.<sup id="cite_ref-65" class="reference"><a href="#cite_note-65"><span class="cite-bracket">[</span>65<span class="cite-bracket">]</span></a></sup> </p><p>The Reflexion method<sup id="cite_ref-sbB2T_66-0" class="reference"><a href="#cite_note-sbB2T-66"><span class="cite-bracket">[</span>66<span class="cite-bracket">]</span></a></sup> constructs an agent that learns over multiple episodes. At the end of each episode, the LLM is given the record of the episode, and prompted to think up "lessons learned", which would help it perform better at a subsequent episode. These "lessons learned" are given to the agent in the subsequent episodes.<sup class="noprint Inline-Template Template-Fact" style="white-space:nowrap;">[<i><a href="/wiki/Wikipedia:Citation_needed" title="Wikipedia:Citation needed"><span title="This claim needs references to reliable sources. (February 2024)">citation needed</span></a></i>]</sup> </p><p><a href="/wiki/Monte_Carlo_tree_search" title="Monte Carlo tree search">Monte Carlo tree search</a> can use an LLM as rollout heuristic. 
When a programmatic world model is not available, an LLM can also be prompted with a description of the environment to act as a world model.<sup id="cite_ref-ltTer_67-0" class="reference"><a href="#cite_note-ltTer-67"><span class="cite-bracket">[</span>67<span class="cite-bracket">]</span></a></sup> </p><p>For open-ended exploration, an LLM can be used to score observations for their "interestingness", which can be used as a reward signal to guide a normal (non-LLM) reinforcement learning agent.<sup id="cite_ref-mBvD9_68-0" class="reference"><a href="#cite_note-mBvD9-68"><span class="cite-bracket">[</span>68<span class="cite-bracket">]</span></a></sup> Alternatively, it can <a href="/wiki/Zone_of_proximal_development" title="Zone of proximal development">propose increasingly difficult tasks</a> for <a href="/wiki/Curriculum_learning" title="Curriculum learning">curriculum learning</a>.<sup id="cite_ref-:0_69-0" class="reference"><a href="#cite_note-:0-69"><span class="cite-bracket">[</span>69<span class="cite-bracket">]</span></a></sup> Instead of outputting individual actions, an LLM planner can also construct "skills", or <a href="/wiki/Function_(computer_programming)" title="Function (computer programming)">functions</a> for complex action sequences. The skills can be stored and later invoked, allowing increasing levels of abstraction in planning.<sup id="cite_ref-:0_69-1" class="reference"><a href="#cite_note-:0-69"><span class="cite-bracket">[</span>69<span class="cite-bracket">]</span></a></sup> </p><p>LLM-powered agents can keep a long-term memory of their previous contexts, and the memory can be retrieved in the same way as Retrieval Augmented Generation. 
Multiple such agents can interact socially.<sup id="cite_ref-XuvjF_70-0" class="reference"><a href="#cite_note-XuvjF-70"><span class="cite-bracket">[</span>70<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading2"><h2 id="Compression">Compression</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=17" title="Edit section: Compression"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Typically, LLMs are trained with single- or half-precision floating point numbers (float32 and float16). One float16 has 16 bits, or 2 bytes, and so one billion parameters require 2 gigabytes. The largest models typically have 100 billion parameters, requiring 200 gigabytes to load, which places them outside the range of most consumer electronics.<sup id="cite_ref-71" class="reference"><a href="#cite_note-71"><span class="cite-bracket">[</span>71<span class="cite-bracket">]</span></a></sup> </p><p><i>Post-training <a href="/wiki/Quantization_(signal_processing)" title="Quantization (signal processing)">quantization</a></i><sup id="cite_ref-LS2Go_72-0" class="reference"><a href="#cite_note-LS2Go-72"><span class="cite-bracket">[</span>72<span class="cite-bracket">]</span></a></sup> aims to decrease the space requirement by lowering precision of the parameters of a trained model, while preserving most of its performance.<sup id="cite_ref-cpzcK_73-0" class="reference"><a href="#cite_note-cpzcK-73"><span class="cite-bracket">[</span>73<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-QVU95_74-0" class="reference"><a href="#cite_note-QVU95-74"><span class="cite-bracket">[</span>74<span class="cite-bracket">]</span></a></sup> The simplest form of quantization simply truncates all numbers to a given number of bits. 
It can be improved by using a different quantization <a href="/wiki/Block_cipher" title="Block cipher">codebook</a> per layer. Further improvement can be done by applying <a href="/wiki/Mixed-precision_arithmetic" title="Mixed-precision arithmetic">different precisions</a> to different parameters, with higher precision for particularly important parameters ("outlier weights").<sup id="cite_ref-dU9Bu_75-0" class="reference"><a href="#cite_note-dU9Bu-75"><span class="cite-bracket">[</span>75<span class="cite-bracket">]</span></a></sup> See <sup id="cite_ref-76" class="reference"><a href="#cite_note-76"><span class="cite-bracket">[</span>76<span class="cite-bracket">]</span></a></sup> for a visual guide. </p><p>While quantized models are typically frozen, and only pre-quantized models are fine-tuned, quantized models can still be fine-tuned.<sup id="cite_ref-D0nFA_77-0" class="reference"><a href="#cite_note-D0nFA-77"><span class="cite-bracket">[</span>77<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading2"><h2 id="Multimodality">Multimodality</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=18" title="Edit section: Multimodality"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">See also: <a href="/wiki/Multimodal_learning" title="Multimodal learning">Multimodal learning</a></div> <p>Multimodality means "having several modalities", and a <a href="/wiki/Modality_(human%E2%80%93computer_interaction)" title="Modality (human–computer interaction)">"modality"</a> refers to a type of input or output, such as video, image, audio, text, <a href="/wiki/Proprioception" title="Proprioception">proprioception</a>, etc.<sup id="cite_ref-78" class="reference"><a 
href="#cite_note-78"><span class="cite-bracket">[</span>78<span class="cite-bracket">]</span></a></sup> There have been many AI models trained specifically to ingest one modality and output another modality, such as <a href="/wiki/AlexNet" title="AlexNet">AlexNet</a> for image to label,<sup id="cite_ref-79" class="reference"><a href="#cite_note-79"><span class="cite-bracket">[</span>79<span class="cite-bracket">]</span></a></sup> <a href="/wiki/Visual_question_answering" class="mw-redirect" title="Visual question answering">visual question answering</a> for image-text to text,<sup id="cite_ref-80" class="reference"><a href="#cite_note-80"><span class="cite-bracket">[</span>80<span class="cite-bracket">]</span></a></sup> and <a href="/wiki/Speech_recognition" title="Speech recognition">speech recognition</a> for speech to text. </p><p>A common method to create multimodal models out of an LLM is to "tokenize" the output of a trained encoder. Concretely, one can construct an LLM that can understand images as follows: take a trained LLM, and take a trained image encoder <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle E}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>E</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle E}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/4232c9de2ee3eec0a9c0a19b15ab92daa6223f9b" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.776ex; height:2.176ex;" alt="{\displaystyle E}"></span>. 
Make a small multilayered perceptron <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle f}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>f</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle f}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/132e57acb643253e7810ee9702d9581f159a1c61" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:1.279ex; height:2.509ex;" alt="{\displaystyle f}"></span>, so that for any image <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle y}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>y</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle y}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/b8a6208ec717213d4317e666f1ae872e00620a0d" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:1.155ex; height:2.009ex;" alt="{\displaystyle y}"></span>, the post-processed vector <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle f(E(y))}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>f</mi> <mo stretchy="false">(</mo> <mi>E</mi> <mo stretchy="false">(</mo> <mi>y</mi> <mo stretchy="false">)</mo> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle f(E(y))}</annotation> 
</semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/8d41d0ec0611a795f65ea14a43b8016462703a8e" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:7.828ex; height:2.843ex;" alt="{\displaystyle f(E(y))}"></span> has the same dimensions as an encoded token. That is an "image token". Then, one can interleave text tokens and image tokens. The compound model is then fine-tuned on an image-text dataset. This basic construction can be applied with more sophistication to improve the model. The image encoder may be frozen to improve stability.<sup id="cite_ref-81" class="reference"><a href="#cite_note-81"><span class="cite-bracket">[</span>81<span class="cite-bracket">]</span></a></sup> </p><p>Flamingo demonstrated the effectiveness of the tokenization method, finetuning a pair of pretrained language model and image encoder to perform better on visual question answering than models trained from scratch.<sup id="cite_ref-82" class="reference"><a href="#cite_note-82"><span class="cite-bracket">[</span>82<span class="cite-bracket">]</span></a></sup> <a href="/wiki/Pathways_Language_Model" class="mw-redirect" title="Pathways Language Model">Google PaLM</a> model was fine-tuned into a multimodal model PaLM-E using the tokenization method, and applied to robotic control.<sup id="cite_ref-83" class="reference"><a href="#cite_note-83"><span class="cite-bracket">[</span>83<span class="cite-bracket">]</span></a></sup> <a href="/wiki/LLaMA" class="mw-redirect" title="LLaMA">LLaMA</a> models have also been turned multimodal using the tokenization method, to allow image inputs,<sup id="cite_ref-84" class="reference"><a href="#cite_note-84"><span class="cite-bracket">[</span>84<span class="cite-bracket">]</span></a></sup> and video inputs.<sup id="cite_ref-85" class="reference"><a href="#cite_note-85"><span class="cite-bracket">[</span>85<span class="cite-bracket">]</span></a></sup> 
</p><p><a href="/wiki/GPT-4" title="GPT-4">GPT-4</a> can use both text and image as inputs<sup id="cite_ref-86" class="reference"><a href="#cite_note-86"><span class="cite-bracket">[</span>86<span class="cite-bracket">]</span></a></sup> (although the vision component was not released to the public until GPT-4V<sup id="cite_ref-87" class="reference"><a href="#cite_note-87"><span class="cite-bracket">[</span>87<span class="cite-bracket">]</span></a></sup>); <a href="/wiki/Google_DeepMind" title="Google DeepMind">Google DeepMind</a>'s <a href="/wiki/Gemini_(language_model)" title="Gemini (language model)">Gemini</a> is also multimodal.<sup id="cite_ref-88" class="reference"><a href="#cite_note-88"><span class="cite-bracket">[</span>88<span class="cite-bracket">]</span></a></sup> Mistral introduced its own multimodal Pixtral 12B model in September 2024.<sup id="cite_ref-89" class="reference"><a href="#cite_note-89"><span class="cite-bracket">[</span>89<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading2"><h2 id="Properties">Properties</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=19" title="Edit section: Properties"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <div class="mw-heading mw-heading3"><h3 id="Scaling_laws">Scaling laws</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=20" title="Edit section: Scaling laws"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">Main article: <a href="/wiki/Neural_scaling_law" title="Neural scaling law">Neural scaling law</a></div> <p>The following four hyper-parameters characterize an LLM: </p> 
<ul><li>cost of (pre-)training (<small><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle C}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>C</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle C}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/4fc55753007cd3c18576f7933f6f089196732029" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.766ex; height:2.176ex;" alt="{\displaystyle C}"></span></small>),</li> <li>size of the <a href="/wiki/Artificial_neural_network" class="mw-redirect" title="Artificial neural network">artificial neural network</a> itself, such as number of parameters <small><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle N}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>N</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle N}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f5e3890c981ae85503089652feb48b191b57aae3" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:2.064ex; height:2.176ex;" alt="{\displaystyle N}"></span></small> (i.e. amount of neurons in its layers, amount of weights between them and biases),</li> <li>size of its (pre-)training dataset (i.e. 
number of tokens in corpus, <small><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle D}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>D</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle D}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f34a0c600395e5d4345287e21fb26efd386990e6" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.924ex; height:2.176ex;" alt="{\displaystyle D}"></span></small>),</li> <li>performance after (pre-)training.</li></ul> <p>They are related by simple <a href="/wiki/Empirical_statistical_laws" title="Empirical statistical laws">statistical laws</a>, called "scaling laws". One particular scaling law ("<a href="/wiki/Chinchilla_AI" class="mw-redirect" title="Chinchilla AI">Chinchilla scaling</a>") for LLM autoregressively trained for one epoch, with a <a href="/wiki/Log-log_plot" class="mw-redirect" title="Log-log plot">log-log</a> <a href="/wiki/Learning_rate" title="Learning rate">learning rate</a> schedule, states that:<sup id="cite_ref-fJta3_90-0" class="reference"><a href="#cite_note-fJta3-90"><span class="cite-bracket">[</span>90<span class="cite-bracket">]</span></a></sup> <span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\begin{cases}C=C_{0}ND\\[6pt]L={\frac {A}{N^{\alpha }}}+{\frac {B}{D^{\beta }}}+L_{0}\end{cases}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mrow> <mo>{</mo> <mtable columnalign="left left" rowspacing="0.8em 0.2em" columnspacing="1em" displaystyle="false"> 
<mtr> <mtd> <mi>C</mi> <mo>=</mo> <msub> <mi>C</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>0</mn> </mrow> </msub> <mi>N</mi> <mi>D</mi> </mtd> </mtr> <mtr> <mtd> <mi>L</mi> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mi>A</mi> <msup> <mi>N</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>α<!-- α --></mi> </mrow> </msup> </mfrac> </mrow> <mo>+</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mi>B</mi> <msup> <mi>D</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>β<!-- β --></mi> </mrow> </msup> </mfrac> </mrow> <mo>+</mo> <msub> <mi>L</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>0</mn> </mrow> </msub> </mtd> </mtr> </mtable> <mo fence="true" stretchy="true" symmetric="true"></mo> </mrow> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\begin{cases}C=C_{0}ND\\[6pt]L={\frac {A}{N^{\alpha }}}+{\frac {B}{D^{\beta }}}+L_{0}\end{cases}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/39435f4ecd5e00c0714a4f7f71cc0b91f5973cdd" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -3.505ex; width:22.298ex; height:8.176ex;" alt="{\displaystyle {\begin{cases}C=C_{0}ND\\[6pt]L={\frac {A}{N^{\alpha }}}+{\frac {B}{D^{\beta }}}+L_{0}\end{cases}}}"></span> where the variables are </p> <ul><li><small><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle C}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>C</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle C}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/4fc55753007cd3c18576f7933f6f089196732029" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.766ex; height:2.176ex;" 
alt="{\displaystyle C}"></span></small> is the cost of training the model, in <a href="/wiki/FLOPS" class="mw-redirect" title="FLOPS">FLOPs</a>.</li> <li><small><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle N}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>N</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle N}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f5e3890c981ae85503089652feb48b191b57aae3" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:2.064ex; height:2.176ex;" alt="{\displaystyle N}"></span></small> is the number of parameters in the model.</li> <li><small><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle D}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>D</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle D}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f34a0c600395e5d4345287e21fb26efd386990e6" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.924ex; height:2.176ex;" alt="{\displaystyle D}"></span></small> is the number of tokens in the training set.</li> <li><small><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle L}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>L</mi> </mstyle> </mrow> <annotation 
encoding="application/x-tex">{\displaystyle L}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/103168b86f781fe6e9a4a87b8ea1cebe0ad4ede8" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.583ex; height:2.176ex;" alt="{\displaystyle L}"></span></small> is the average negative log-likelihood loss per token (<a href="/wiki/Nat_(unit)" title="Nat (unit)">nats</a>/token), achieved by the trained LLM on the test dataset.</li></ul> <p>and the statistical hyper-parameters are </p> <ul><li><small><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle C_{0}=6}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>C</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>0</mn> </mrow> </msub> <mo>=</mo> <mn>6</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle C_{0}=6}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/b05c98b1743f05e046a3f3bb0a966fa898e431e2" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:6.977ex; height:2.509ex;" alt="{\displaystyle C_{0}=6}"></span></small>, meaning that it costs 6 FLOPs per parameter to train on one token. 
Note that training cost is much higher than inference cost, where it costs 1 to 2 FLOPs per parameter to infer on one token.<sup id="cite_ref-kaplan-scaling_55-1" class="reference"><a href="#cite_note-kaplan-scaling-55"><span class="cite-bracket">[</span>55<span class="cite-bracket">]</span></a></sup></li> <li><small><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \alpha =0.34,\beta =0.28,A=406.4,B=410.7,L_{0}=1.69}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>α<!-- α --></mi> <mo>=</mo> <mn>0.34</mn> <mo>,</mo> <mi>β<!-- β --></mi> <mo>=</mo> <mn>0.28</mn> <mo>,</mo> <mi>A</mi> <mo>=</mo> <mn>406.4</mn> <mo>,</mo> <mi>B</mi> <mo>=</mo> <mn>410.7</mn> <mo>,</mo> <msub> <mi>L</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>0</mn> </mrow> </msub> <mo>=</mo> <mn>1.69</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \alpha =0.34,\beta =0.28,A=406.4,B=410.7,L_{0}=1.69}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/848b6d78d881ed6da8d6b60e8d788bc799525401" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:51.588ex; height:2.509ex;" alt="{\displaystyle \alpha =0.34,\beta =0.28,A=406.4,B=410.7,L_{0}=1.69}"></span></small></li></ul> <div class="mw-heading mw-heading3"><h3 id="Emergent_abilities">Emergent abilities</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=21" title="Edit section: Emergent abilities"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p><span class="anchor" id="Emergent_abilities"></span></p><figure class="mw-default-size" typeof="mw:File/Thumb"><a href="/wiki/File:LLM_emergent_benchmarks.png" 
class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/5/57/LLM_emergent_benchmarks.png/220px-LLM_emergent_benchmarks.png" decoding="async" width="220" height="146" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/5/57/LLM_emergent_benchmarks.png/330px-LLM_emergent_benchmarks.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/5/57/LLM_emergent_benchmarks.png/440px-LLM_emergent_benchmarks.png 2x" data-file-width="1297" data-file-height="858" /></a><figcaption>At point(s) referred to as <a href="/wiki/Broken_Neural_Scaling_Law" class="mw-redirect" title="Broken Neural Scaling Law">breaks</a>,<sup id="cite_ref-IYm4Q_91-0" class="reference"><a href="#cite_note-IYm4Q-91"><span class="cite-bracket">[</span>91<span class="cite-bracket">]</span></a></sup> the lines change their slopes, appearing on a linear-log plot as a series of linear segments connected by arcs.</figcaption></figure> <p>Performance of bigger models on various tasks, when plotted on a log-log scale, appears as a linear extrapolation of performance achieved by smaller models. 
However, this linearity may be punctuated by "<a href="/wiki/Broken_Neural_Scaling_Law" class="mw-redirect" title="Broken Neural Scaling Law">break(s)</a>"<sup id="cite_ref-IYm4Q_91-1" class="reference"><a href="#cite_note-IYm4Q-91"><span class="cite-bracket">[</span>91<span class="cite-bracket">]</span></a></sup> in the scaling law, where the slope of the line changes abruptly, and where larger models acquire "emergent abilities".<sup id="cite_ref-emergentpaper_40-1" class="reference"><a href="#cite_note-emergentpaper-40"><span class="cite-bracket">[</span>40<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-JM6s1_92-0" class="reference"><a href="#cite_note-JM6s1-92"><span class="cite-bracket">[</span>92<span class="cite-bracket">]</span></a></sup> They arise from the complex interaction of the model's components and are not explicitly programmed or designed.<sup id="cite_ref-Bowman_93-0" class="reference"><a href="#cite_note-Bowman-93"><span class="cite-bracket">[</span>93<span class="cite-bracket">]</span></a></sup> </p><p>Furthermore, recent research has demonstrated that AI systems, including large language models, can employ heuristic reasoning akin to human cognition. They balance between exhaustive logical processing and the use of cognitive shortcuts (heuristics), adapting their reasoning strategies to optimize between accuracy and effort. 
This behavior aligns with principles of resource-rational human cognition, as discussed in classical theories of bounded rationality and dual-process theory.<sup id="cite_ref-Heuristic-Mukherjee_94-0" class="reference"><a href="#cite_note-Heuristic-Mukherjee-94"><span class="cite-bracket">[</span>94<span class="cite-bracket">]</span></a></sup> </p><p>The most intriguing among emergent abilities is <a href="/wiki/In-context_learning" class="mw-redirect" title="In-context learning">in-context learning</a> from example demonstrations.<sup id="cite_ref-Hahn_20230314_95-0" class="reference"><a href="#cite_note-Hahn_20230314-95"><span class="cite-bracket">[</span>95<span class="cite-bracket">]</span></a></sup> In-context learning is involved in tasks such as: </p> <ul><li>reported arithmetic, decoding the <a href="/wiki/International_Phonetic_Alphabet" title="International Phonetic Alphabet">International Phonetic Alphabet</a>, unscrambling a word's letters, disambiguating words in context,<sup id="cite_ref-emergentpaper_40-2" class="reference"><a href="#cite_note-emergentpaper-40"><span class="cite-bracket">[</span>40<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-57FEA_96-0" class="reference"><a href="#cite_note-57FEA-96"><span class="cite-bracket">[</span>96<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-TEIkA_97-0" class="reference"><a href="#cite_note-TEIkA-97"><span class="cite-bracket">[</span>97<span class="cite-bracket">]</span></a></sup> converting spatial words, <a href="/wiki/Cardinal_direction" title="Cardinal direction">cardinal directions</a> (for example, replying "northeast" upon [0, 0, 1; 0, 0, 0; 0, 0, 0]), color terms represented in text.<sup id="cite_ref-zgy1i_98-0" class="reference"><a href="#cite_note-zgy1i-98"><span class="cite-bracket">[</span>98<span class="cite-bracket">]</span></a></sup></li> <li><a href="/wiki/Chain-of-thought_prompting" class="mw-redirect" title="Chain-of-thought prompting">chain-of-thought 
prompting</a>: Model outputs are improved by chain-of-thought prompting only when model size exceeds 62B parameters. Smaller models perform better when prompted to answer immediately, without chain of thought.<sup id="cite_ref-Imb98_99-0" class="reference"><a href="#cite_note-Imb98-99"><span class="cite-bracket">[</span>99<span class="cite-bracket">]</span></a></sup></li> <li>identifying offensive content in paragraphs of <a href="/wiki/Hinglish" title="Hinglish">Hinglish</a> (a combination of Hindi and English), and generating a similar English equivalent of <a href="/wiki/Kiswahili" class="mw-redirect" title="Kiswahili">Kiswahili</a> proverbs.<sup id="cite_ref-CeQVF_100-0" class="reference"><a href="#cite_note-CeQVF-100"><span class="cite-bracket">[</span>100<span class="cite-bracket">]</span></a></sup></li></ul> <p>Schaeffer <i>et al.</i> argue that the emergent abilities are not unpredictably acquired, but predictably acquired according to a <a href="/wiki/Neural_scaling_law" title="Neural scaling law">smooth scaling law</a>. 
The authors considered a toy statistical model of an LLM solving multiple-choice questions, and showed that this statistical model, modified to account for other types of tasks, applies to these tasks as well.<sup id="cite_ref-C775b_101-0" class="reference"><a href="#cite_note-C775b-101"><span class="cite-bracket">[</span>101<span class="cite-bracket">]</span></a></sup> </p><p>Let <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle x}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>x</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle x}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/87f9e315fd7e2ba406057a97300593c4802b53e4" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.33ex; height:1.676ex;" alt="{\displaystyle x}"></span> be the parameter count, and <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle y}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>y</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle y}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/b8a6208ec717213d4317e666f1ae872e00620a0d" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:1.155ex; height:2.009ex;" alt="{\displaystyle y}"></span> be the performance of the model. 
</p> <div style="font-size:85%;"> <ul><li>When <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle y={\text{average }}\Pr({\text{correct token}})}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>y</mi> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mtext>average </mtext> </mrow> <mo movablelimits="true" form="prefix">Pr</mo> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mtext>correct token</mtext> </mrow> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle y={\text{average }}\Pr({\text{correct token}})}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/25f87a1a04b7eb97aca02ae9170ae7f05e308bd4" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:30.404ex; height:2.843ex;" alt="{\displaystyle y={\text{average }}\Pr({\text{correct token}})}"></span>, then <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle (\log x,y)}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mo stretchy="false">(</mo> <mi>log</mi> <mo>⁡<!-- --></mo> <mi>x</mi> <mo>,</mo> <mi>y</mi> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle (\log x,y)}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/1dccdbdb2af7f930d3fff961d7f76540706bbaf8" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:8.687ex; height:2.843ex;" alt="{\displaystyle (\log x,y)}"></span> is an exponential curve (before it hits the 
plateau at one), which looks like emergence.</li> <li>When <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle y={\text{average }}\log(\Pr({\text{correct token}}))}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>y</mi> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mtext>average </mtext> </mrow> <mi>log</mi> <mo>⁡<!-- --></mo> <mo stretchy="false">(</mo> <mo movablelimits="true" form="prefix">Pr</mo> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mtext>correct token</mtext> </mrow> <mo stretchy="false">)</mo> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle y={\text{average }}\log(\Pr({\text{correct token}}))}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/c22c18197c1091afcb5ed896ba90b8429af1c861" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:35.185ex; height:2.843ex;" alt="{\displaystyle y={\text{average }}\log(\Pr({\text{correct token}}))}"></span>, then the <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle (\log x,y)}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mo stretchy="false">(</mo> <mi>log</mi> <mo>⁡<!-- --></mo> <mi>x</mi> <mo>,</mo> <mi>y</mi> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle (\log x,y)}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/1dccdbdb2af7f930d3fff961d7f76540706bbaf8" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: 
-0.838ex; width:8.687ex; height:2.843ex;" alt="{\displaystyle (\log x,y)}"></span> plot is a straight line (before it hits the plateau at zero), which does not look like emergence.</li> <li>When <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle y={\text{average }}\Pr({\text{the most likely token is correct}})}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>y</mi> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mtext>average </mtext> </mrow> <mo movablelimits="true" form="prefix">Pr</mo> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mtext>the most likely token is correct</mtext> </mrow> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle y={\text{average }}\Pr({\text{the most likely token is correct}})}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/6028c3484d3fbd36ffdc2cad41ff60ba9f8c1e7a" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:47.867ex; height:2.843ex;" alt="{\displaystyle y={\text{average }}\Pr({\text{the most likely token is correct}})}"></span>, then <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle (\log x,y)}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mo stretchy="false">(</mo> <mi>log</mi> <mo>⁡<!-- --></mo> <mi>x</mi> <mo>,</mo> <mi>y</mi> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle (\log x,y)}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/1dccdbdb2af7f930d3fff961d7f76540706bbaf8" 
class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:8.687ex; height:2.843ex;" alt="{\displaystyle (\log x,y)}"></span> is a step-function, which looks like emergence.</li></ul></div> <div class="mw-heading mw-heading2"><h2 id="Interpretation">Interpretation</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=22" title="Edit section: Interpretation"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Large language models by themselves are <a href="/wiki/Black_box" title="Black box">black boxes</a>, and it is not clear how they can perform linguistic tasks. There are several methods for understanding how LLMs work. </p><p>Mechanistic interpretability aims to <a href="/wiki/Reverse_engineering" title="Reverse engineering">reverse-engineer</a> LLMs by discovering symbolic algorithms that approximate the inference performed by an LLM. One example is Othello-GPT, where a small Transformer is trained to predict legal <a href="/wiki/Reversi" title="Reversi">Othello</a> moves. It was found that there is a linear representation of the Othello board, and modifying the representation changes the predicted legal Othello moves in the correct way.<sup id="cite_ref-IZSIr_102-0" class="reference"><a href="#cite_note-IZSIr-102"><span class="cite-bracket">[</span>102<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-RLik9_103-0" class="reference"><a href="#cite_note-RLik9-103"><span class="cite-bracket">[</span>103<span class="cite-bracket">]</span></a></sup> In another example, a small Transformer is trained on <a href="/wiki/Karel_(programming_language)" title="Karel (programming language)">Karel programs</a>. Similar to the Othello-GPT example, there is a linear representation of Karel program semantics, and modifying the representation changes the output in the correct way. 
The model also generates correct programs that are on average shorter than those in the training set.<sup id="cite_ref-Hln1l_104-0" class="reference"><a href="#cite_note-Hln1l-104"><span class="cite-bracket">[</span>104<span class="cite-bracket">]</span></a></sup> </p><p>In another example, the authors trained small transformers on <a href="/wiki/Modular_arithmetic" title="Modular arithmetic">modular arithmetic addition</a>. The resulting models were reverse-engineered, and it turned out they used <a href="/wiki/Discrete_Fourier_transform" title="Discrete Fourier transform">discrete Fourier transform</a>.<sup id="cite_ref-oYGlo_105-0" class="reference"><a href="#cite_note-oYGlo-105"><span class="cite-bracket">[</span>105<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Understanding_and_intelligence">Understanding and intelligence</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=23" title="Edit section: Understanding and intelligence"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">See also: <a href="/wiki/Philosophy_of_artificial_intelligence" title="Philosophy of artificial intelligence">Philosophy of artificial intelligence</a> and <a href="/wiki/Artificial_consciousness" title="Artificial consciousness">Artificial consciousness</a></div> <p>NLP researchers were evenly split when asked, in a 2022 survey, whether (untuned) LLMs "could (ever) understand natural language in some nontrivial sense".<sup id="cite_ref-debate_understanding_106-0" class="reference"><a href="#cite_note-debate_understanding-106"><span class="cite-bracket">[</span>106<span class="cite-bracket">]</span></a></sup> Proponents of "LLM understanding" believe that some LLM abilities, 
such as mathematical reasoning, imply an ability to <a href="/wiki/Natural_language_understanding" title="Natural language understanding">"understand"</a> certain concepts. A Microsoft team argued in 2023 that GPT-4 "can solve novel and difficult tasks that span mathematics, coding, vision, medicine, law, psychology and more" and that GPT-4 "could reasonably be viewed as an early (yet still incomplete) version of an <a href="/wiki/Artificial_general_intelligence" title="Artificial general intelligence">artificial general intelligence</a> system": "Can one reasonably say that a system that passes exams for software engineering candidates is not <i>really</i> intelligent?"<sup id="cite_ref-O8Upd_107-0" class="reference"><a href="#cite_note-O8Upd-107"><span class="cite-bracket">[</span>107<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-microsoft_sparks_108-0" class="reference"><a href="#cite_note-microsoft_sparks-108"><span class="cite-bracket">[</span>108<span class="cite-bracket">]</span></a></sup> <a href="/wiki/Ilya_Sutskever" title="Ilya Sutskever">Ilya Sutskever</a> argues that predicting the next word sometimes involves reasoning and deep insights, for example if the LLM has to predict the name of the criminal in an unknown detective novel after processing the entire story leading up to the revelation.<sup id="cite_ref-109" class="reference"><a href="#cite_note-109"><span class="cite-bracket">[</span>109<span class="cite-bracket">]</span></a></sup> Some researchers characterize LLMs as "alien intelligence".<sup id="cite_ref-rEEmH_110-0" class="reference"><a href="#cite_note-rEEmH-110"><span class="cite-bracket">[</span>110<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-new_yorker_kind_of_mind_111-0" class="reference"><a href="#cite_note-new_yorker_kind_of_mind-111"><span class="cite-bracket">[</span>111<span class="cite-bracket">]</span></a></sup> For example, Conjecture CEO <a href="/wiki/Connor_Leahy" title="Connor Leahy">Connor 
Leahy</a> considers untuned LLMs to be like inscrutable alien "<a href="/wiki/Shoggoth" title="Shoggoth">Shoggoths</a>", and believes that RLHF tuning creates a "smiling facade" obscuring the inner workings of the LLM: "If you don't push it too far, the smiley face stays on. But then you give it [an unexpected] prompt, and suddenly you see this massive underbelly of insanity, of weird thought processes and clearly non-human understanding."<sup id="cite_ref-rAFIZ_112-0" class="reference"><a href="#cite_note-rAFIZ-112"><span class="cite-bracket">[</span>112<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-4luKE_113-0" class="reference"><a href="#cite_note-4luKE-113"><span class="cite-bracket">[</span>113<span class="cite-bracket">]</span></a></sup> </p><p>In contrast, some proponents of the "LLMs lack understanding" school believe that existing LLMs are "simply remixing and recombining existing writing",<sup id="cite_ref-new_yorker_kind_of_mind_111-1" class="reference"><a href="#cite_note-new_yorker_kind_of_mind-111"><span class="cite-bracket">[</span>111<span class="cite-bracket">]</span></a></sup> a phenomenon known as <a href="/wiki/Stochastic_parrot" title="Stochastic parrot">stochastic parrot</a>, or they point to the deficits existing LLMs continue to have in prediction skills, reasoning skills, agency, and explainability.<sup id="cite_ref-debate_understanding_106-1" class="reference"><a href="#cite_note-debate_understanding-106"><span class="cite-bracket">[</span>106<span class="cite-bracket">]</span></a></sup> For example, GPT-4 has natural deficits in planning and in real-time learning.<sup id="cite_ref-microsoft_sparks_108-1" class="reference"><a href="#cite_note-microsoft_sparks-108"><span class="cite-bracket">[</span>108<span class="cite-bracket">]</span></a></sup> Generative LLMs have been observed to confidently assert claims of fact which do not seem to be <a href="/wiki/Justification_(epistemology)" title="Justification 
(epistemology)">justified</a> by their <a href="/wiki/Training_data" class="mw-redirect" title="Training data">training data</a>, a phenomenon which has been termed "<a href="/wiki/Hallucination_(artificial_intelligence)" title="Hallucination (artificial intelligence)">hallucination</a>".<sup id="cite_ref-hallucination-survey_114-0" class="reference"><a href="#cite_note-hallucination-survey-114"><span class="cite-bracket">[</span>114<span class="cite-bracket">]</span></a></sup> Specifically, hallucinations in the context of LLMs correspond to the generation of text or responses that seem syntactically sound, fluent, and natural but are factually incorrect, nonsensical, or unfaithful to the provided source input.<sup id="cite_ref-115" class="reference"><a href="#cite_note-115"><span class="cite-bracket">[</span>115<span class="cite-bracket">]</span></a></sup> Neuroscientist <a href="/wiki/Terrence_Sejnowski" class="mw-redirect" title="Terrence Sejnowski">Terrence Sejnowski</a> has argued that "The diverging opinions of experts on the intelligence of LLMs suggests that our old ideas based on natural intelligence are inadequate".<sup id="cite_ref-debate_understanding_106-2" class="reference"><a href="#cite_note-debate_understanding-106"><span class="cite-bracket">[</span>106<span class="cite-bracket">]</span></a></sup> </p><p>The matter of LLMs exhibiting intelligence or understanding has two main aspects – the first is how to model thought and language in a computer system, and the second is how to enable the computer system to generate human-like language.<sup id="cite_ref-debate_understanding_106-3" class="reference"><a href="#cite_note-debate_understanding-106"><span class="cite-bracket">[</span>106<span class="cite-bracket">]</span></a></sup> These aspects of language as a model of <a href="/wiki/Cognition" title="Cognition">cognition</a> have been developed in the field of <a href="/wiki/Cognitive_linguistics" title="Cognitive linguistics">cognitive 
linguistics</a>. American linguist <a href="/wiki/George_Lakoff" title="George Lakoff">George Lakoff</a> presented the Neural Theory of Language (NTL)<sup id="cite_ref-116" class="reference"><a href="#cite_note-116"><span class="cite-bracket">[</span>116<span class="cite-bracket">]</span></a></sup> as a <a href="/wiki/Cognitive_linguistics#Computational_approaches" title="Cognitive linguistics">computational basis</a> for using language as a model of learning tasks and understanding. <a rel="nofollow" class="external text" href="https://www.icsi.berkeley.edu/icsi/projects/ai/ntl">The NTL Model</a> outlines how specific neural structures of the human brain shape the nature of thought and language and, in turn, which computational properties of such neural systems can be applied to model thought and language in a computer system. After a framework for modeling language in computer systems was established, the focus shifted to establishing frameworks for computer systems to generate language with acceptable grammar. 
In his 2014 book titled <i><a href="/wiki/The_Language_Myth" title="The Language Myth">The Language Myth: Why Language Is Not An Instinct</a></i>, British cognitive linguist and digital communication technologist <a href="/wiki/Vyvyan_Evans" title="Vyvyan Evans">Vyvyan Evans</a> mapped out the role of <a href="/wiki/Probabilistic_context-free_grammar" title="Probabilistic context-free grammar">probabilistic context-free grammar</a> (PCFG) in enabling <a href="/wiki/Natural_language_processing#Cognition" title="Natural language processing">NLP to model cognitive patterns</a> and generate human-like language.<sup id="cite_ref-117" class="reference"><a href="#cite_note-117"><span class="cite-bracket">[</span>117<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-118" class="reference"><a href="#cite_note-118"><span class="cite-bracket">[</span>118<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading2"><h2 id="Evaluation">Evaluation</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=24" title="Edit section: Evaluation"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <div class="mw-heading mw-heading3"><h3 id="Perplexity">Perplexity</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=25" title="Edit section: Perplexity"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>The canonical measure of the performance of an LLM is its <a href="/wiki/Perplexity" title="Perplexity">perplexity</a> on a given text corpus. Perplexity measures how well a model predicts the contents of a dataset; the higher the likelihood the model assigns to the dataset, the lower the perplexity. In mathematical terms, perplexity is the exponential of the average negative log likelihood per token. 
</p><p><span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \log({\text{Perplexity}})=-{\frac {1}{N}}\sum _{i=1}^{N}\log(\Pr({\text{token}}_{i}\mid {\text{context for token}}_{i}))}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>log</mi> <mo>⁡<!-- --></mo> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mtext>Perplexity</mtext> </mrow> <mo stretchy="false">)</mo> <mo>=</mo> <mo>−<!-- − --></mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mn>1</mn> <mi>N</mi> </mfrac> </mrow> <munderover> <mo>∑<!-- ∑ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> <mo>=</mo> <mn>1</mn> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>N</mi> </mrow> </munderover> <mi>log</mi> <mo>⁡<!-- --></mo> <mo stretchy="false">(</mo> <mo movablelimits="true" form="prefix">Pr</mo> <mo stretchy="false">(</mo> <msub> <mrow class="MJX-TeXAtom-ORD"> <mtext>token</mtext> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </msub> <mo>∣<!-- ∣ --></mo> <msub> <mrow class="MJX-TeXAtom-ORD"> <mtext>context for token</mtext> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </msub> <mo stretchy="false">)</mo> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \log({\text{Perplexity}})=-{\frac {1}{N}}\sum _{i=1}^{N}\log(\Pr({\text{token}}_{i}\mid {\text{context for token}}_{i}))}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/556393708767666076b9723412bc8519284449a5" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -3.005ex; width:62.586ex; height:7.343ex;" alt="{\displaystyle \log({\text{Perplexity}})=-{\frac {1}{N}}\sum _{i=1}^{N}\log(\Pr({\text{token}}_{i}\mid {\text{context for token}}_{i}))}"></span> </p><p>Here, <span 
class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle N}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>N</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle N}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f5e3890c981ae85503089652feb48b191b57aae3" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:2.064ex; height:2.176ex;" alt="{\displaystyle N}"></span> is the number of tokens in the text corpus, and "context for token <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle i}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>i</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle i}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/add78d8608ad86e54951b8c8bd6c8d8416533d20" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:0.802ex; height:2.176ex;" alt="{\displaystyle i}"></span>" depends on the specific type of LLM. 
If the LLM is autoregressive, then "context for token <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle i}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>i</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle i}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/add78d8608ad86e54951b8c8bd6c8d8416533d20" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:0.802ex; height:2.176ex;" alt="{\displaystyle i}"></span>" is the segment of text appearing before token <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle i}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>i</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle i}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/add78d8608ad86e54951b8c8bd6c8d8416533d20" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:0.802ex; height:2.176ex;" alt="{\displaystyle i}"></span>. 
If the LLM is masked, then "context for token <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle i}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>i</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle i}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/add78d8608ad86e54951b8c8bd6c8d8416533d20" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:0.802ex; height:2.176ex;" alt="{\displaystyle i}"></span>" is the segment of text surrounding token <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle i}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>i</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle i}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/add78d8608ad86e54951b8c8bd6c8d8416533d20" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:0.802ex; height:2.176ex;" alt="{\displaystyle i}"></span>. 
</p><p>Because language models may <a href="/wiki/Overfit" class="mw-redirect" title="Overfit">overfit</a> to training data, models are usually evaluated by their perplexity on a <a href="/wiki/Test_set" class="mw-redirect" title="Test set">test set</a>.<sup id="cite_ref-jm_47-2" class="reference"><a href="#cite_note-jm-47"><span class="cite-bracket">[</span>47<span class="cite-bracket">]</span></a></sup> This evaluation is potentially problematic for larger models which, as they are trained on increasingly large corpora of text, are increasingly likely to inadvertently include portions of any given test set.<sup id="cite_ref-few-shot-learners_2-1" class="reference"><a href="#cite_note-few-shot-learners-2"><span class="cite-bracket">[</span>2<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading4"><h4 id="BPW,_BPC,_and_BPT"><span id="BPW.2C_BPC.2C_and_BPT"></span>BPW, BPC, and BPT</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=26" title="Edit section: BPW, BPC, and BPT"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>In <a href="/wiki/Information_theory" title="Information theory">information theory</a>, the concept of <a href="/wiki/Entropy_(information_theory)" title="Entropy (information theory)">entropy</a> is intricately linked to perplexity, a relationship notably established by <a href="/wiki/Claude_Shannon" title="Claude Shannon">Claude Shannon</a>.<sup id="cite_ref-Huyen_119-0" class="reference"><a href="#cite_note-Huyen-119"><span class="cite-bracket">[</span>119<span class="cite-bracket">]</span></a></sup> This relationship is mathematically expressed as <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\text{Entropy}}=\log _{2}({\text{Perplexity}})}"> 
<semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mtext>Entropy</mtext> </mrow> <mo>=</mo> <msub> <mi>log</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo>⁡<!-- --></mo> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mtext>Perplexity</mtext> </mrow> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\text{Entropy}}=\log _{2}({\text{Perplexity}})}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/462f40a6811ee57670d1735c452d04be85a82c57" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:27.813ex; height:2.843ex;" alt="{\displaystyle {\text{Entropy}}=\log _{2}({\text{Perplexity}})}"></span>. </p><p>Entropy, in this context, is commonly quantified in terms of bits per word (BPW) or bits per character (BPC), which hinges on whether the language model utilizes word-based or character-based tokenization. </p><p>Notably, in the case of larger language models that predominantly employ sub-word tokenization, bits per token (BPT) emerges as a seemingly more appropriate measure. However, due to the variance in tokenization methods across different Large Language Models (LLMs), BPT does not serve as a reliable metric for comparative analysis among diverse models. To convert BPT into BPW, one can multiply it by the average number of tokens per word. </p><p>In the evaluation and comparison of language models, <a href="/wiki/Cross-entropy" title="Cross-entropy">cross-entropy</a> is generally the preferred metric over entropy. The underlying principle is that a lower BPW is indicative of a model's enhanced capability for compression. This, in turn, reflects the model's proficiency in making accurate predictions. 
</p> <div class="mw-heading mw-heading3"><h3 id="Task-specific_datasets_and_benchmarks">Task-specific datasets and benchmarks</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=27" title="Edit section: Task-specific datasets and benchmarks"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>A large number of testing datasets and <a href="/wiki/Benchmark_(computing)" title="Benchmark (computing)">benchmarks</a> have also been developed to evaluate the capabilities of language models on more specific downstream tasks. Tests may be designed to evaluate a variety of capabilities, including general knowledge, <a href="/wiki/Commonsense_reasoning" title="Commonsense reasoning">commonsense reasoning</a>, and mathematical problem-solving. </p><p>One broad category of evaluation dataset is question answering datasets, consisting of pairs of questions and correct answers, for example, ("Have the San Jose Sharks won the Stanley Cup?", "No").<sup id="cite_ref-boolq_120-0" class="reference"><a href="#cite_note-boolq-120"><span class="cite-bracket">[</span>120<span class="cite-bracket">]</span></a></sup> A question answering task is considered "open book" if the model's prompt includes text from which the expected answer can be derived (for example, the previous question could be adjoined with some text which includes the sentence "The Sharks have advanced to the Stanley Cup finals once, losing to the Pittsburgh Penguins in 2016."<sup id="cite_ref-boolq_120-1" class="reference"><a href="#cite_note-boolq-120"><span class="cite-bracket">[</span>120<span class="cite-bracket">]</span></a></sup>). 
Otherwise, the task is considered "closed book", and the model must draw on knowledge retained during training.<sup id="cite_ref-survey_121-0" class="reference"><a href="#cite_note-survey-121"><span class="cite-bracket">[</span>121<span class="cite-bracket">]</span></a></sup> Some examples of commonly used question answering datasets include TruthfulQA, Web Questions, TriviaQA, and SQuAD.<sup id="cite_ref-survey_121-1" class="reference"><a href="#cite_note-survey-121"><span class="cite-bracket">[</span>121<span class="cite-bracket">]</span></a></sup> </p><p>Evaluation datasets may also take the form of text completion, having the model select the most likely word or sentence to complete a prompt, for example: "Alice was friends with Bob. Alice went to visit her friend, ____".<sup id="cite_ref-few-shot-learners_2-2" class="reference"><a href="#cite_note-few-shot-learners-2"><span class="cite-bracket">[</span>2<span class="cite-bracket">]</span></a></sup> </p><p>Some composite benchmarks have also been developed which combine a diversity of different evaluation datasets and tasks. 
Examples include GLUE, SuperGLUE, <a href="/wiki/MMLU" title="MMLU">MMLU</a>, BIG-bench, and HELM.<sup id="cite_ref-Huyen_119-1" class="reference"><a href="#cite_note-Huyen-119"><span class="cite-bracket">[</span>119<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-survey_121-2" class="reference"><a href="#cite_note-survey-121"><span class="cite-bracket">[</span>121<span class="cite-bracket">]</span></a></sup> OpenAI has released tools for running composite benchmarks, but noted that the evaluation results are sensitive to the prompting method.<sup id="cite_ref-122" class="reference"><a href="#cite_note-122"><span class="cite-bracket">[</span>122<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-123" class="reference"><a href="#cite_note-123"><span class="cite-bracket">[</span>123<span class="cite-bracket">]</span></a></sup> Some public datasets contain questions that are mislabeled, ambiguous, unanswerable, or otherwise of low quality, which can be cleaned to give more reliable benchmark scores.<sup id="cite_ref-124" class="reference"><a href="#cite_note-124"><span class="cite-bracket">[</span>124<span class="cite-bracket">]</span></a></sup> </p><p>It was previously standard to report results on a held-out portion of an evaluation dataset after doing supervised fine-tuning on the remainder. It is now more common to evaluate a pre-trained model directly through prompting techniques, though researchers vary in the details of how they formulate prompts for particular tasks, particularly with respect to how many examples of solved tasks are adjoined to the prompt (i.e. the value of <i>n</i> in <i>n</i>-shot prompting). 
</p> <div class="mw-heading mw-heading4"><h4 id="Adversarially_constructed_evaluations">Adversarially constructed evaluations</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=28" title="Edit section: Adversarially constructed evaluations"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Because of the rapid pace of improvement of large language models, evaluation benchmarks have suffered from short lifespans, with state-of-the-art models quickly "saturating" existing benchmarks and exceeding the performance of human annotators, leading to efforts to replace or augment the benchmarks with more challenging tasks.<sup id="cite_ref-bigbench_125-0" class="reference"><a href="#cite_note-bigbench-125"><span class="cite-bracket">[</span>125<span class="cite-bracket">]</span></a></sup> In addition, there are cases of "shortcut learning" wherein AIs sometimes "cheat" on multiple-choice tests by using statistical correlations in superficial test question wording in order to guess the correct responses, without necessarily understanding the actual question being asked.<sup id="cite_ref-debate_understanding_106-4" class="reference"><a href="#cite_note-debate_understanding-106"><span class="cite-bracket">[</span>106<span class="cite-bracket">]</span></a></sup> </p><p>Some datasets have been constructed adversarially, focusing on particular problems on which extant language models seem to have unusually poor performance compared to humans. One example is the TruthfulQA dataset, a question answering dataset consisting of 817 questions which language models are susceptible to answering incorrectly by mimicking falsehoods to which they were repeatedly exposed during training. For example, an LLM may answer "No" to the question "Can you teach an old dog new tricks?" 
because of its exposure to the English idiom <i><a href="https://en.wiktionary.org/wiki/you_can%27t_teach_an_old_dog_new_tricks" class="extiw" title="wikt:you can't teach an old dog new tricks">you can't teach an old dog new tricks</a></i>, even though this is not literally true.<sup id="cite_ref-truthfulqa_126-0" class="reference"><a href="#cite_note-truthfulqa-126"><span class="cite-bracket">[</span>126<span class="cite-bracket">]</span></a></sup> </p><p>Another example of an adversarial evaluation dataset is Swag and its successor, HellaSwag, collections of problems in which one of multiple options must be selected to complete a text passage. The incorrect completions were generated by sampling from a language model and filtering with a set of classifiers. The resulting problems are trivial for humans, but at the time the datasets were created, state-of-the-art language models had poor accuracy on them. For example: </p> <blockquote> <p>We see a fitness center sign. We then see a man talking to the camera and sitting and laying on a exercise ball. The man... <br />a) demonstrates how to increase efficient exercise work by running up and down balls. <br />b) moves all his arms and legs and builds up a lot of muscle. <br />c) then plays the ball and we see a graphics and hedge trimming demonstration. 
<br />d) performs sit ups while on the ball and talking.<sup id="cite_ref-hellaswag_127-0" class="reference"><a href="#cite_note-hellaswag-127"><span class="cite-bracket">[</span>127<span class="cite-bracket">]</span></a></sup> </p> </blockquote> <p><a href="/wiki/BERT_(language_model)" title="BERT (language model)">BERT</a> selects b) as the most likely completion, though the correct answer is d).<sup id="cite_ref-hellaswag_127-1" class="reference"><a href="#cite_note-hellaswag-127"><span class="cite-bracket">[</span>127<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading2"><h2 id="Wider_impact">Wider impact</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=29" title="Edit section: Wider impact"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>In 2023, <i><a href="/wiki/Nature_Biomedical_Engineering" title="Nature Biomedical Engineering">Nature Biomedical Engineering</a></i> wrote that "it is no longer possible to accurately distinguish" human-written text from text created by large language models, and that "It is all but certain that general-purpose large language models will rapidly proliferate... 
It is a rather safe bet that they will change many industries over time."<sup id="cite_ref-ZDTUM_128-0" class="reference"><a href="#cite_note-ZDTUM-128"><span class="cite-bracket">[</span>128<span class="cite-bracket">]</span></a></sup> <a href="/wiki/Goldman_Sachs" title="Goldman Sachs">Goldman Sachs</a> suggested in 2023 that generative language AI could increase global GDP by 7% in the next ten years, and could expose to automation 300 million jobs globally.<sup id="cite_ref-81w7x_129-0" class="reference"><a href="#cite_note-81w7x-129"><span class="cite-bracket">[</span>129<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-zIM6Y_130-0" class="reference"><a href="#cite_note-zIM6Y-130"><span class="cite-bracket">[</span>130<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Memorization_and_copyright">Memorization and copyright</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=30" title="Edit section: Memorization and copyright"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">Further information: <a href="/wiki/Artificial_intelligence_and_copyright" title="Artificial intelligence and copyright">Artificial intelligence and copyright</a></div> <p>Memorization is an emergent behavior in LLMs in which long strings of text are occasionally output verbatim from training data, contrary to typical behavior of traditional artificial neural nets. 
Evaluations of controlled LLM output measure the amount memorized from training data (focused on GPT-2-series models) as variously over 1% for exact duplicates<sup id="cite_ref-131" class="reference"><a href="#cite_note-131"><span class="cite-bracket">[</span>131<span class="cite-bracket">]</span></a></sup> or up to about 7%.<sup id="cite_ref-132" class="reference"><a href="#cite_note-132"><span class="cite-bracket">[</span>132<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Security">Security</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=31" title="Edit section: Security"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Some commenters expressed concern over accidental or deliberate creation of misinformation, or other forms of misuse.<sup id="cite_ref-nD6kH_133-0" class="reference"><a href="#cite_note-nD6kH-133"><span class="cite-bracket">[</span>133<span class="cite-bracket">]</span></a></sup> For example, the availability of large language models could reduce the skill-level required to commit bioterrorism; biosecurity researcher Kevin Esvelt has suggested that LLM creators should exclude from their training data papers on creating or enhancing pathogens.<sup id="cite_ref-PKiPY_134-0" class="reference"><a href="#cite_note-PKiPY-134"><span class="cite-bracket">[</span>134<span class="cite-bracket">]</span></a></sup> </p><p>A study by researchers at Google and several universities, including <a href="/wiki/Cornell_University" title="Cornell University">Cornell University</a> and <a href="/wiki/University_of_California,_Berkeley" title="University of California, Berkeley">University of California, Berkeley</a>, showed that there are potential security risks in language models such as <a href="/wiki/ChatGPT" title="ChatGPT">ChatGPT</a>. 
In their study, they examined and confirmed the possibility that questioners could get, from ChatGPT, the training data that the AI model used. For example, when asking ChatGPT 3.5 turbo to repeat the word "poem" forever, the AI model will say "poem" hundreds of times and then diverge, deviating from the standard dialogue style and emitting nonsensical phrases, in the process reproducing its training data verbatim. The researchers observed more than 10,000 examples of the AI model exposing its training data in a similar manner. The researchers said that it was hard to tell if the AI model was actually safe or not.<sup id="cite_ref-135" class="reference"><a href="#cite_note-135"><span class="cite-bracket">[</span>135<span class="cite-bracket">]</span></a></sup> </p><p>The potential presence of "sleeper agents" within LLMs is another emerging security concern. These are hidden functionalities built into the model that remain dormant until triggered by a specific event or condition. Upon activation, the LLM deviates from its expected behavior to take insecure actions.<sup id="cite_ref-136" class="reference"><a href="#cite_note-136"><span class="cite-bracket">[</span>136<span class="cite-bracket">]</span></a></sup> </p><p>LLM applications accessible to the public, like ChatGPT or Claude, typically incorporate safety measures designed to filter out harmful content. However, implementing these controls effectively has proven challenging. For instance, a 2023 study<sup id="cite_ref-137" class="reference"><a href="#cite_note-137"><span class="cite-bracket">[</span>137<span class="cite-bracket">]</span></a></sup> proposed a method for circumventing LLM safety systems. 
Similarly, <a href="/wiki/Yongge_Wang" title="Yongge Wang">Yongge Wang</a><sup id="cite_ref-138" class="reference"><a href="#cite_note-138"><span class="cite-bracket">[</span>138<span class="cite-bracket">]</span></a></sup> illustrated in 2024 how a potential criminal could bypass ChatGPT 4o's safety controls to obtain information on establishing a drug trafficking operation. </p> <div class="mw-heading mw-heading3"><h3 id="Algorithmic_bias">Algorithmic bias</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=32" title="Edit section: Algorithmic bias"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">Main article: <a href="/wiki/Algorithmic_bias" title="Algorithmic bias">Algorithmic bias</a></div> <p>While LLMs have shown remarkable capabilities in generating human-like text, they are susceptible to inheriting and amplifying biases present in their training data. 
This can manifest in skewed representations or unfair treatment of different demographics, such as those based on race, gender, language, and cultural groups.<sup id="cite_ref-:8_139-0" class="reference"><a href="#cite_note-:8-139"><span class="cite-bracket">[</span>139<span class="cite-bracket">]</span></a></sup> Since English data is overrepresented in current large language models' training data, these models may also downplay non-English views.<sup id="cite_ref-:1_140-0" class="reference"><a href="#cite_note-:1-140"><span class="cite-bracket">[</span>140<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading4"><h4 id="Stereotyping">Stereotyping</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=33" title="Edit section: Stereotyping"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>AI models can reinforce a wide range of stereotypes, including those based on gender, ethnicity, age, nationality, religion, or occupation. This can lead to outputs that unfairly generalize or caricature groups of people, sometimes in harmful or derogatory ways.<sup id="cite_ref-141" class="reference"><a href="#cite_note-141"><span class="cite-bracket">[</span>141<span class="cite-bracket">]</span></a></sup> </p><p>Notably, gender bias refers to the tendency of these models to produce outputs that are unfairly prejudiced towards one gender over another. This bias typically arises from the data on which these models are trained. 
Large language models often assign roles and characteristics based on traditional gender norms.<sup id="cite_ref-:8_139-1" class="reference"><a href="#cite_note-:8-139"><span class="cite-bracket">[</span>139<span class="cite-bracket">]</span></a></sup> For example, a model might associate nurses or secretaries predominantly with women and engineers or CEOs with men.<sup id="cite_ref-142" class="reference"><a href="#cite_note-142"><span class="cite-bracket">[</span>142<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading4"><h4 id="Political_bias">Political bias</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=34" title="Edit section: Political bias"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Political bias refers to the tendency of algorithms to systematically favor certain political viewpoints, ideologies, or outcomes over others. Language models may also exhibit political biases. 
Since the training data includes a wide range of political opinions and coverage, the models might generate responses that lean towards particular political ideologies or viewpoints, depending on the prevalence of those views in the data.<sup id="cite_ref-143" class="reference"><a href="#cite_note-143"><span class="cite-bracket">[</span>143<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading2"><h2 id="List_of_large_language_models">List of large language models</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=35" title="Edit section: List of large language models"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">See also: <a href="/wiki/List_of_chatbots" title="List of chatbots">List of chatbots</a></div> <p>For the training cost column, 1 petaFLOP-day = 1 petaFLOP/sec × 1 day = 8.64E19 FLOP. Also, only the largest model's cost is written. 
</p> <table class="wikitable sortable"> <tbody><tr> <th>Name</th> <th>Release date<sup id="cite_ref-144" class="reference"><a href="#cite_note-144"><span class="cite-bracket">[</span>a<span class="cite-bracket">]</span></a></sup></th> <th>Developer</th> <th>Number of parameters (billion) <sup id="cite_ref-145" class="reference"><a href="#cite_note-145"><span class="cite-bracket">[</span>b<span class="cite-bracket">]</span></a></sup></th> <th>Corpus size </th> <th>Training cost (petaFLOP-day)</th> <th>License<sup id="cite_ref-146" class="reference"><a href="#cite_note-146"><span class="cite-bracket">[</span>c<span class="cite-bracket">]</span></a></sup></th> <th>Notes </th></tr> <tr> <td><a href="/wiki/GPT-1" title="GPT-1">GPT-1</a></td> <td><span data-sort-value="000000002018-06-01-0000" style="white-space:nowrap">June 2018</span></td> <td><a href="/wiki/OpenAI" title="OpenAI">OpenAI</a></td> <td><span data-sort-value="117000000 !">0.117</span></td> <td> </td> <td>1<sup id="cite_ref-oai-unsup_147-0" class="reference"><a href="#cite_note-oai-unsup-147"><span class="cite-bracket">[</span>144<span class="cite-bracket">]</span></a></sup></td> <td style="background:#9EFF9E;color:black;vertical-align:middle;text-align:center;" class="table-yes">MIT<sup id="cite_ref-gpt1_148-0" class="reference"><a href="#cite_note-gpt1-148"><span class="cite-bracket">[</span>145<span class="cite-bracket">]</span></a></sup> </td> <td>First GPT model, decoder-only transformer. Trained for 30 days on 8 P600 <a href="/wiki/Graphics_processing_unit" title="Graphics processing unit">GPUs</a>. 
</td></tr> <tr> <td><a href="/wiki/BERT_(language_model)" title="BERT (language model)">BERT</a></td> <td><span data-sort-value="000000002018-10-01-0000" style="white-space:nowrap">October 2018</span></td> <td><a href="/wiki/Google" title="Google">Google</a></td> <td><span data-sort-value="340000000 !">0.340</span><sup id="cite_ref-bert-paper_149-0" class="reference"><a href="#cite_note-bert-paper-149"><span class="cite-bracket">[</span>146<span class="cite-bracket">]</span></a></sup></td> <td><span data-sort-value="3300000000 !">3.3 billion</span> words<sup id="cite_ref-bert-paper_149-1" class="reference"><a href="#cite_note-bert-paper-149"><span class="cite-bracket">[</span>146<span class="cite-bracket">]</span></a></sup> </td> <td><span data-sort-value="9 !">9</span><sup id="cite_ref-bHZJ2_150-0" class="reference"><a href="#cite_note-bHZJ2-150"><span class="cite-bracket">[</span>147<span class="cite-bracket">]</span></a></sup></td> <td style="background:#9EFF9E;color:black;vertical-align:middle;text-align:center;" class="table-yes">Apache 2.0<sup id="cite_ref-bert-web_151-0" class="reference"><a href="#cite_note-bert-web-151"><span class="cite-bracket">[</span>148<span class="cite-bracket">]</span></a></sup> </td> <td>An early and influential language model.<sup id="cite_ref-Manning-2022_4-1" class="reference"><a href="#cite_note-Manning-2022-4"><span class="cite-bracket">[</span>4<span class="cite-bracket">]</span></a></sup> <a href="/wiki/Transformer_(deep_learning_architecture)#encoder-only" title="Transformer (deep learning architecture)">Encoder-only</a> and thus not built to be prompted or generative.<sup id="cite_ref-Ir545_152-0" class="reference"><a href="#cite_note-Ir545-152"><span class="cite-bracket">[</span>149<span class="cite-bracket">]</span></a></sup> Training took 4 days on 64 TPUv2 chips.<sup id="cite_ref-:02_153-0" class="reference"><a href="#cite_note-:02-153"><span class="cite-bracket">[</span>150<span class="cite-bracket">]</span></a></sup> 
</td></tr> <tr> <td><a href="/wiki/T5_(language_model)" title="T5 (language model)">T5</a> </td> <td><span data-sort-value="000000002019-10-01-0000" style="white-space:nowrap">October 2019</span> </td> <td>Google </td> <td>11<sup id="cite_ref-:6_154-0" class="reference"><a href="#cite_note-:6-154"><span class="cite-bracket">[</span>151<span class="cite-bracket">]</span></a></sup> </td> <td>34 billion tokens<sup id="cite_ref-:6_154-1" class="reference"><a href="#cite_note-:6-154"><span class="cite-bracket">[</span>151<span class="cite-bracket">]</span></a></sup> </td> <td> </td> <td style="background:#9EFF9E;color:black;vertical-align:middle;text-align:center;" class="table-yes">Apache 2.0<sup id="cite_ref-155" class="reference"><a href="#cite_note-155"><span class="cite-bracket">[</span>152<span class="cite-bracket">]</span></a></sup> </td> <td>Base model for many Google projects, such as Imagen.<sup id="cite_ref-156" class="reference"><a href="#cite_note-156"><span class="cite-bracket">[</span>153<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td><a href="/wiki/XLNet" title="XLNet">XLNet</a></td> <td><span data-sort-value="000000002019-06-01-0000" style="white-space:nowrap">June 2019</span></td> <td><a href="/wiki/Google" title="Google">Google</a></td> <td><span data-sort-value="340000000 !">0.340</span><sup id="cite_ref-157" class="reference"><a href="#cite_note-157"><span class="cite-bracket">[</span>154<span class="cite-bracket">]</span></a></sup></td> <td><span data-sort-value="33000000000 !">33</span> billion words </td> <td>330</td> <td style="background:#9EFF9E;color:black;vertical-align:middle;text-align:center;" class="table-yes">Apache 2.0<sup id="cite_ref-xlnet_158-0" class="reference"><a href="#cite_note-xlnet-158"><span class="cite-bracket">[</span>155<span class="cite-bracket">]</span></a></sup> </td> <td>An alternative to BERT; designed as encoder-only. 
Trained on 512 TPU v3 chips for 5.5 days.<sup id="cite_ref-LX3rI_159-0" class="reference"><a href="#cite_note-LX3rI-159"><span class="cite-bracket">[</span>156<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td><a href="/wiki/GPT-2" title="GPT-2">GPT-2</a></td> <td><span data-sort-value="000000002019-02-01-0000" style="white-space:nowrap">February 2019</span></td> <td><a href="/wiki/OpenAI" title="OpenAI">OpenAI</a></td> <td><span data-sort-value="1500000000 !">1.5</span><sup id="cite_ref-15Brelease_160-0" class="reference"><a href="#cite_note-15Brelease-160"><span class="cite-bracket">[</span>157<span class="cite-bracket">]</span></a></sup></td> <td>40GB<sup id="cite_ref-5T8u5_161-0" class="reference"><a href="#cite_note-5T8u5-161"><span class="cite-bracket">[</span>158<span class="cite-bracket">]</span></a></sup> (~<span data-sort-value="10000000000 !">10 billion</span> tokens)<sup id="cite_ref-LambdaLabs_162-0" class="reference"><a href="#cite_note-LambdaLabs-162"><span class="cite-bracket">[</span>159<span class="cite-bracket">]</span></a></sup> </td> <td>28<sup id="cite_ref-:10_163-0" class="reference"><a href="#cite_note-:10-163"><span class="cite-bracket">[</span>160<span class="cite-bracket">]</span></a></sup></td> <td style="background:#9EFF9E;color:black;vertical-align:middle;text-align:center;" class="table-yes">MIT<sup id="cite_ref-Sudbe_164-0" class="reference"><a href="#cite_note-Sudbe-164"><span class="cite-bracket">[</span>161<span class="cite-bracket">]</span></a></sup> </td> <td>Trained on 32 TPUv3 chips for 1 week.<sup id="cite_ref-:10_163-1" class="reference"><a href="#cite_note-:10-163"><span class="cite-bracket">[</span>160<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td><a href="/wiki/GPT-3" title="GPT-3">GPT-3</a></td> <td><span data-sort-value="000000002020-05-01-0000" style="white-space:nowrap">May 2020</span></td> <td>OpenAI</td> <td><span data-sort-value="175000000000 !">175</span><sup 
id="cite_ref-Wiggers_51-1" class="reference"><a href="#cite_note-Wiggers-51"><span class="cite-bracket">[</span>51<span class="cite-bracket">]</span></a></sup></td> <td><span data-sort-value="300000000000 !">300 billion</span> tokens<sup id="cite_ref-LambdaLabs_162-1" class="reference"><a href="#cite_note-LambdaLabs-162"><span class="cite-bracket">[</span>159<span class="cite-bracket">]</span></a></sup> </td> <td>3640<sup id="cite_ref-:2_165-0" class="reference"><a href="#cite_note-:2-165"><span class="cite-bracket">[</span>162<span class="cite-bracket">]</span></a></sup></td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">proprietary </td> <td>A fine-tuned variant of GPT-3, termed GPT-3.5, was made available to the public through a web interface called <a href="/wiki/ChatGPT" title="ChatGPT">ChatGPT</a> in 2022.<sup id="cite_ref-chatgpt-blog_166-0" class="reference"><a href="#cite_note-chatgpt-blog-166"><span class="cite-bracket">[</span>163<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td>GPT-Neo</td> <td><span data-sort-value="000000002021-03-01-0000" style="white-space:nowrap">March 2021</span></td> <td><a href="/wiki/EleutherAI" title="EleutherAI">EleutherAI</a></td> <td><span data-sort-value="2700000000 !">2.7</span><sup id="cite_ref-gpt-neo_167-0" class="reference"><a href="#cite_note-gpt-neo-167"><span class="cite-bracket">[</span>164<span class="cite-bracket">]</span></a></sup></td> <td>825 GiB<sup id="cite_ref-Pile_168-0" class="reference"><a href="#cite_note-Pile-168"><span class="cite-bracket">[</span>165<span class="cite-bracket">]</span></a></sup> </td> <td></td> <td style="background:#9EFF9E;color:black;vertical-align:middle;text-align:center;" class="table-yes">MIT<sup id="cite_ref-vb-gpt-neo_169-0" class="reference"><a href="#cite_note-vb-gpt-neo-169"><span class="cite-bracket">[</span>166<span class="cite-bracket">]</span></a></sup> </td> <td>The first of <a 
href="/wiki/EleutherAI#GPT_models" title="EleutherAI">a series of free GPT-3 alternatives</a> released by EleutherAI. GPT-Neo outperformed an equivalent-size GPT-3 model on some benchmarks, but was significantly worse than the largest GPT-3.<sup id="cite_ref-vb-gpt-neo_169-1" class="reference"><a href="#cite_note-vb-gpt-neo-169"><span class="cite-bracket">[</span>166<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td><a href="/wiki/GPT-J" title="GPT-J">GPT-J</a></td> <td><span data-sort-value="000000002021-06-01-0000" style="white-space:nowrap">June 2021</span></td> <td><a href="/wiki/EleutherAI" title="EleutherAI">EleutherAI</a></td> <td><span data-sort-value="6000000000 !">6</span><sup id="cite_ref-JxohJ_170-0" class="reference"><a href="#cite_note-JxohJ-170"><span class="cite-bracket">[</span>167<span class="cite-bracket">]</span></a></sup></td> <td>825 GiB<sup id="cite_ref-Pile_168-1" class="reference"><a href="#cite_note-Pile-168"><span class="cite-bracket">[</span>165<span class="cite-bracket">]</span></a></sup> </td> <td>200<sup id="cite_ref-:3_171-0" class="reference"><a href="#cite_note-:3-171"><span class="cite-bracket">[</span>168<span class="cite-bracket">]</span></a></sup></td> <td style="background:#9EFF9E;color:black;vertical-align:middle;text-align:center;" class="table-yes">Apache 2.0 </td> <td>GPT-3-style language model </td></tr> <tr> <td>Megatron-Turing NLG</td> <td><span data-sort-value="000000002021-10-01-0000" style="white-space:nowrap">October 2021</span><sup id="cite_ref-BwnW5_172-0" class="reference"><a href="#cite_note-BwnW5-172"><span class="cite-bracket">[</span>169<span class="cite-bracket">]</span></a></sup></td> <td><a href="/wiki/Microsoft" title="Microsoft">Microsoft</a> and <a href="/wiki/Nvidia" title="Nvidia">Nvidia</a></td> <td><span data-sort-value="530000000000 !">530</span><sup id="cite_ref-mtnlg-preprint_173-0" class="reference"><a href="#cite_note-mtnlg-preprint-173"><span 
class="cite-bracket">[</span>170<span class="cite-bracket">]</span></a></sup></td> <td><span data-sort-value="338600000000 !">338.6 billion</span> tokens<sup id="cite_ref-mtnlg-preprint_173-1" class="reference"><a href="#cite_note-mtnlg-preprint-173"><span class="cite-bracket">[</span>170<span class="cite-bracket">]</span></a></sup> </td> <td>38000<sup id="cite_ref-:11_174-0" class="reference"><a href="#cite_note-:11-174"><span class="cite-bracket">[</span>171<span class="cite-bracket">]</span></a></sup></td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">Restricted web access </td> <td>Trained for 3 months on over 2000 A100 GPUs on the NVIDIA <a href="/wiki/Selene_(supercomputer)" title="Selene (supercomputer)">Selene Supercomputer</a>, for over 3 million GPU-hours.<sup id="cite_ref-:11_174-1" class="reference"><a href="#cite_note-:11-174"><span class="cite-bracket">[</span>171<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td>Ernie 3.0 Titan</td> <td><span data-sort-value="000000002021-12-01-0000" style="white-space:nowrap">December 2021</span></td> <td><a href="/wiki/Baidu" title="Baidu">Baidu</a></td> <td><span data-sort-value="260000000000 !">260</span><sup id="cite_ref-qeOB8_175-0" class="reference"><a href="#cite_note-qeOB8-175"><span class="cite-bracket">[</span>172<span class="cite-bracket">]</span></a></sup></td> <td>4 TB </td> <td></td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">Proprietary </td> <td>Chinese-language LLM. <a href="/wiki/Ernie_Bot" title="Ernie Bot">Ernie Bot</a> is based on this model. 
</td></tr> <tr> <td><a href="/wiki/Claude_(language_model)" title="Claude (language model)">Claude</a><sup id="cite_ref-i8jc4_176-0" class="reference"><a href="#cite_note-i8jc4-176"><span class="cite-bracket">[</span>173<span class="cite-bracket">]</span></a></sup></td> <td><span data-sort-value="000000002021-12-01-0000" style="white-space:nowrap">December 2021</span></td> <td><a href="/wiki/Anthropic" title="Anthropic">Anthropic</a></td> <td><span data-sort-value="52000000000 !">52</span><sup id="cite_ref-AnthroArch_177-0" class="reference"><a href="#cite_note-AnthroArch-177"><span class="cite-bracket">[</span>174<span class="cite-bracket">]</span></a></sup></td> <td><span data-sort-value="400000000000 !">400 billion</span> tokens<sup id="cite_ref-AnthroArch_177-1" class="reference"><a href="#cite_note-AnthroArch-177"><span class="cite-bracket">[</span>174<span class="cite-bracket">]</span></a></sup> </td> <td></td> <td style="background: #FFB; color:black; vertical-align: middle; text-align: center;" class="table-partial">beta </td> <td>Fine-tuned for desirable behavior in conversations.<sup id="cite_ref-RZqhw_178-0" class="reference"><a href="#cite_note-RZqhw-178"><span class="cite-bracket">[</span>175<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td>GLaM (Generalist Language Model)</td> <td><span data-sort-value="000000002021-12-01-0000" style="white-space:nowrap">December 2021</span></td> <td>Google</td> <td><span data-sort-value="1200000000000 !">1200</span><sup id="cite_ref-glam-blog_39-1" class="reference"><a href="#cite_note-glam-blog-39"><span class="cite-bracket">[</span>39<span class="cite-bracket">]</span></a></sup></td> <td><span data-sort-value="1600000000000 !">1.6 trillion</span> tokens<sup id="cite_ref-glam-blog_39-2" class="reference"><a href="#cite_note-glam-blog-39"><span class="cite-bracket">[</span>39<span class="cite-bracket">]</span></a></sup> </td> <td>5600<sup id="cite_ref-glam-blog_39-3" class="reference"><a 
href="#cite_note-glam-blog-39"><span class="cite-bracket">[</span>39<span class="cite-bracket">]</span></a></sup></td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">Proprietary </td> <td>Sparse <a href="/wiki/Mixture_of_experts" title="Mixture of experts">mixture of experts</a> model, making it more expensive to train but cheaper to run inference compared to GPT-3. </td></tr> <tr> <td>Gopher</td> <td><span data-sort-value="000000002021-12-01-0000" style="white-space:nowrap">December 2021</span></td> <td><a href="/wiki/DeepMind" class="mw-redirect" title="DeepMind">DeepMind</a></td> <td><span data-sort-value="280000000000 !">280</span><sup id="cite_ref-mD5eE_179-0" class="reference"><a href="#cite_note-mD5eE-179"><span class="cite-bracket">[</span>176<span class="cite-bracket">]</span></a></sup></td> <td><span data-sort-value="300000000000 !">300 billion</span> tokens<sup id="cite_ref-hoffman_180-0" class="reference"><a href="#cite_note-hoffman-180"><span class="cite-bracket">[</span>177<span class="cite-bracket">]</span></a></sup> </td> <td>5833<sup id="cite_ref-:4_181-0" class="reference"><a href="#cite_note-:4-181"><span class="cite-bracket">[</span>178<span class="cite-bracket">]</span></a></sup></td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">Proprietary </td> <td>Later developed into the Chinchilla model. 
</td></tr> <tr> <td><a href="/wiki/LaMDA" title="LaMDA">LaMDA</a> (Language Models for Dialog Applications)</td> <td><span data-sort-value="000000002022-01-01-0000" style="white-space:nowrap">January 2022</span></td> <td>Google</td> <td><span data-sort-value="137000000000 !">137</span><sup id="cite_ref-lamda-blog_182-0" class="reference"><a href="#cite_note-lamda-blog-182"><span class="cite-bracket">[</span>179<span class="cite-bracket">]</span></a></sup></td> <td>1.56T words,<sup id="cite_ref-lamda-blog_182-1" class="reference"><a href="#cite_note-lamda-blog-182"><span class="cite-bracket">[</span>179<span class="cite-bracket">]</span></a></sup> <span data-sort-value="168000000000 !">168 billion</span> tokens<sup id="cite_ref-hoffman_180-1" class="reference"><a href="#cite_note-hoffman-180"><span class="cite-bracket">[</span>177<span class="cite-bracket">]</span></a></sup> </td> <td>4110<sup id="cite_ref-DMs9Z_183-0" class="reference"><a href="#cite_note-DMs9Z-183"><span class="cite-bracket">[</span>180<span class="cite-bracket">]</span></a></sup></td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">Proprietary </td> <td>Specialized for response generation in conversations. 
</td></tr> <tr> <td>GPT-NeoX</td> <td><span data-sort-value="000000002022-02-01-0000" style="white-space:nowrap">February 2022</span></td> <td><a href="/wiki/EleutherAI" title="EleutherAI">EleutherAI</a></td> <td><span data-sort-value="20000000000 !">20</span><sup id="cite_ref-gpt-neox-20b_184-0" class="reference"><a href="#cite_note-gpt-neox-20b-184"><span class="cite-bracket">[</span>181<span class="cite-bracket">]</span></a></sup></td> <td>825 GiB<sup id="cite_ref-Pile_168-2" class="reference"><a href="#cite_note-Pile-168"><span class="cite-bracket">[</span>165<span class="cite-bracket">]</span></a></sup> </td> <td>740<sup id="cite_ref-:3_171-1" class="reference"><a href="#cite_note-:3-171"><span class="cite-bracket">[</span>168<span class="cite-bracket">]</span></a></sup></td> <td style="background:#9EFF9E;color:black;vertical-align:middle;text-align:center;" class="table-yes">Apache 2.0 </td> <td>based on the Megatron architecture </td></tr> <tr> <td><a href="/wiki/Chinchilla_AI" class="mw-redirect" title="Chinchilla AI">Chinchilla</a></td> <td><span data-sort-value="000000002022-03-01-0000" style="white-space:nowrap">March 2022</span></td> <td><a href="/wiki/DeepMind" class="mw-redirect" title="DeepMind">DeepMind</a></td> <td><span data-sort-value="70000000000 !">70</span><sup id="cite_ref-chinchilla-blog_185-0" class="reference"><a href="#cite_note-chinchilla-blog-185"><span class="cite-bracket">[</span>182<span class="cite-bracket">]</span></a></sup></td> <td><span data-sort-value="1400000000000 !">1.4 trillion</span> tokens<sup id="cite_ref-chinchilla-blog_185-1" class="reference"><a href="#cite_note-chinchilla-blog-185"><span class="cite-bracket">[</span>182<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-hoffman_180-2" class="reference"><a href="#cite_note-hoffman-180"><span class="cite-bracket">[</span>177<span class="cite-bracket">]</span></a></sup> </td> <td>6805<sup id="cite_ref-:4_181-1" class="reference"><a 
href="#cite_note-:4-181"><span class="cite-bracket">[</span>178<span class="cite-bracket">]</span></a></sup></td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">Proprietary </td> <td>Reduced-parameter model trained on more data. Used in the <a href="/wiki/Sparrow_(bot)" class="mw-redirect" title="Sparrow (bot)">Sparrow</a> bot. Often cited for its <a href="/wiki/Neural_scaling_law" title="Neural scaling law">neural scaling law</a>. </td></tr> <tr> <td><a href="/wiki/PaLM" title="PaLM">PaLM</a> (Pathways Language Model)</td> <td><span data-sort-value="000000002022-04-01-0000" style="white-space:nowrap">April 2022</span></td> <td>Google</td> <td><span data-sort-value="540000000000 !">540</span><sup id="cite_ref-palm-blog_186-0" class="reference"><a href="#cite_note-palm-blog-186"><span class="cite-bracket">[</span>183<span class="cite-bracket">]</span></a></sup></td> <td><span data-sort-value="768000000000 !">768 billion</span> tokens<sup id="cite_ref-chinchilla-blog_185-2" class="reference"><a href="#cite_note-chinchilla-blog-185"><span class="cite-bracket">[</span>182<span class="cite-bracket">]</span></a></sup> </td> <td><span data-sort-value="29250 !">29,250</span><sup id="cite_ref-:4_181-2" class="reference"><a href="#cite_note-:4-181"><span class="cite-bracket">[</span>178<span class="cite-bracket">]</span></a></sup></td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">Proprietary </td> <td>Trained for ~60 days on ~6000 <a href="/wiki/Tensor_Processing_Unit" title="Tensor Processing Unit">TPU v4</a> chips.<sup id="cite_ref-:4_181-3" class="reference"><a href="#cite_note-:4-181"><span class="cite-bracket">[</span>178<span class="cite-bracket">]</span></a></sup> As of October 2024<sup class="plainlinks noexcerpt noprint asof-tag update" style="display:none;"><a class="external text" 
href="https://en.wikipedia.org/w/index.php?title=Large_language_model&action=edit">[update]</a></sup>, it is the largest dense Transformer published. </td></tr> <tr> <td>OPT (Open Pretrained Transformer)</td> <td><span data-sort-value="000000002022-05-01-0000" style="white-space:nowrap">May 2022</span></td> <td><a href="/wiki/Meta_Platforms" title="Meta Platforms">Meta</a></td> <td><span data-sort-value="175000000000 !">175</span><sup id="cite_ref-jlof8_187-0" class="reference"><a href="#cite_note-jlof8-187"><span class="cite-bracket">[</span>184<span class="cite-bracket">]</span></a></sup></td> <td><span data-sort-value="180000000000 !">180 billion</span> tokens<sup id="cite_ref-QjTIc_188-0" class="reference"><a href="#cite_note-QjTIc-188"><span class="cite-bracket">[</span>185<span class="cite-bracket">]</span></a></sup> </td> <td>310<sup id="cite_ref-:3_171-2" class="reference"><a href="#cite_note-:3-171"><span class="cite-bracket">[</span>168<span class="cite-bracket">]</span></a></sup></td> <td style="background: #FFB; color:black; vertical-align: middle; text-align: center;" class="table-partial">Non-commercial research<sup id="cite_ref-189" class="reference"><a href="#cite_note-189"><span class="cite-bracket">[</span>d<span class="cite-bracket">]</span></a></sup> </td> <td>GPT-3 architecture with some adaptations from Megatron. 
Uniquely, the training logbook written by the team was published.<sup id="cite_ref-190" class="reference"><a href="#cite_note-190"><span class="cite-bracket">[</span>186<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td>YaLM 100B</td> <td><span data-sort-value="000000002022-06-01-0000" style="white-space:nowrap">June 2022</span></td> <td><a href="/wiki/Yandex" title="Yandex">Yandex</a></td> <td><span data-sort-value="100000000000 !">100</span><sup id="cite_ref-yalm-repo_191-0" class="reference"><a href="#cite_note-yalm-repo-191"><span class="cite-bracket">[</span>187<span class="cite-bracket">]</span></a></sup> </td> <td>1.7TB<sup id="cite_ref-yalm-repo_191-1" class="reference"><a href="#cite_note-yalm-repo-191"><span class="cite-bracket">[</span>187<span class="cite-bracket">]</span></a></sup></td> <td></td> <td style="background:#9EFF9E;color:black;vertical-align:middle;text-align:center;" class="table-yes">Apache 2.0</td> <td>English-Russian model based on Microsoft's Megatron-LM. 
</td></tr> <tr> <td>Minerva</td> <td><span data-sort-value="000000002022-06-01-0000" style="white-space:nowrap">June 2022</span></td> <td>Google</td> <td><span data-sort-value="540000000000 !">540</span><sup id="cite_ref-minerva-paper_192-0" class="reference"><a href="#cite_note-minerva-paper-192"><span class="cite-bracket">[</span>188<span class="cite-bracket">]</span></a></sup></td> <td>38.5B tokens from webpages filtered for mathematical content and from papers submitted to the arXiv preprint server<sup id="cite_ref-minerva-paper_192-1" class="reference"><a href="#cite_note-minerva-paper-192"><span class="cite-bracket">[</span>188<span class="cite-bracket">]</span></a></sup> </td> <td></td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">Proprietary </td> <td>For solving "mathematical and scientific questions using step-by-step reasoning".<sup id="cite_ref-FfCNK_193-0" class="reference"><a href="#cite_note-FfCNK-193"><span class="cite-bracket">[</span>189<span class="cite-bracket">]</span></a></sup> Initialized from PaLM models, then finetuned on mathematical and scientific data. 
</td></tr> <tr> <td><a href="/wiki/BLOOM_(language_model)" title="BLOOM (language model)">BLOOM</a></td> <td><span data-sort-value="000000002022-07-01-0000" style="white-space:nowrap">July 2022</span></td> <td>Large collaboration led by <a href="/wiki/Hugging_Face" title="Hugging Face">Hugging Face</a></td> <td><span data-sort-value="175000000000 !">175</span><sup id="cite_ref-bigger-better_194-0" class="reference"><a href="#cite_note-bigger-better-194"><span class="cite-bracket">[</span>190<span class="cite-bracket">]</span></a></sup></td> <td><span data-sort-value="350000000000 !">350 billion</span> tokens (1.6TB)<sup id="cite_ref-B8wB2_195-0" class="reference"><a href="#cite_note-B8wB2-195"><span class="cite-bracket">[</span>191<span class="cite-bracket">]</span></a></sup> </td> <td></td> <td style="background: #FFB; color:black; vertical-align: middle; text-align: center;" class="table-partial">Responsible AI </td> <td>Essentially GPT-3 but trained on a multi-lingual corpus (30% English excluding programming languages) </td></tr> <tr> <td>Galactica</td> <td><span data-sort-value="000000002022-11-01-0000" style="white-space:nowrap">November 2022</span></td> <td><a href="/wiki/Meta_Platforms" title="Meta Platforms">Meta</a></td> <td><span data-sort-value="120000000000 !">120</span></td> <td><span data-sort-value="106000000000 !">106 billion</span> tokens<sup id="cite_ref-37sY6_196-0" class="reference"><a href="#cite_note-37sY6-196"><span class="cite-bracket">[</span>192<span class="cite-bracket">]</span></a></sup> </td> <td>unknown</td> <td style="background: #FFB; color:black; vertical-align: middle; text-align: center;" class="table-partial">CC-BY-NC-4.0 </td> <td>Trained on scientific text and modalities. 
</td></tr> <tr> <td>AlexaTM (Teacher Models)</td> <td><span data-sort-value="000000002022-11-01-0000" style="white-space:nowrap">November 2022</span></td> <td><a href="/wiki/Amazon_(company)" title="Amazon (company)">Amazon</a></td> <td><span data-sort-value="20000000000 !">20</span><sup id="cite_ref-u5szh_197-0" class="reference"><a href="#cite_note-u5szh-197"><span class="cite-bracket">[</span>193<span class="cite-bracket">]</span></a></sup></td> <td><span data-sort-value="1300000000000 !">1.3 trillion</span> tokens<sup id="cite_ref-HaA7l_198-0" class="reference"><a href="#cite_note-HaA7l-198"><span class="cite-bracket">[</span>194<span class="cite-bracket">]</span></a></sup> </td> <td></td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">proprietary<sup id="cite_ref-rpehM_199-0" class="reference"><a href="#cite_note-rpehM-199"><span class="cite-bracket">[</span>195<span class="cite-bracket">]</span></a></sup> </td> <td>bidirectional sequence-to-sequence architecture </td></tr> <tr> <td><a href="/wiki/Neuro-sama" title="Neuro-sama">Neuro-sama</a></td> <td><span data-sort-value="000000002022-12-01-0000" style="white-space:nowrap">December 2022</span></td> <td>Independent</td> <td>Unknown</td> <td>Unknown </td> <td></td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">privately-owned </td> <td>A language model designed for live-streaming on <a href="/wiki/Twitch_(service)" title="Twitch (service)">Twitch</a>. 
</td></tr> <tr> <td><a href="/wiki/LLaMA" class="mw-redirect" title="LLaMA">LLaMA</a> (Large Language Model Meta AI)</td> <td><span data-sort-value="000000002023-02-01-0000" style="white-space:nowrap">February 2023</span></td> <td><a href="/wiki/Meta_AI" title="Meta AI">Meta AI</a></td> <td><span data-sort-value="65000000000 !">65</span><sup id="cite_ref-llama-blog_200-0" class="reference"><a href="#cite_note-llama-blog-200"><span class="cite-bracket">[</span>196<span class="cite-bracket">]</span></a></sup></td> <td><span data-sort-value="1400000000000 !">1.4 trillion</span> tokens<sup id="cite_ref-llama-blog_200-1" class="reference"><a href="#cite_note-llama-blog-200"><span class="cite-bracket">[</span>196<span class="cite-bracket">]</span></a></sup> </td> <td>6300<sup id="cite_ref-:5_201-0" class="reference"><a href="#cite_note-:5-201"><span class="cite-bracket">[</span>197<span class="cite-bracket">]</span></a></sup></td> <td style="background: #FFB; color:black; vertical-align: middle; text-align: center;" class="table-partial">Non-commercial research<sup id="cite_ref-202" class="reference"><a href="#cite_note-202"><span class="cite-bracket">[</span>e<span class="cite-bracket">]</span></a></sup> </td> <td>Corpus has 20 languages. 
"Overtrained" (compared to <a href="/wiki/Chinchilla_(language_model)" title="Chinchilla (language model)">Chinchilla scaling law</a>) for better performance with fewer parameters.<sup id="cite_ref-llama-blog_200-2" class="reference"><a href="#cite_note-llama-blog-200"><span class="cite-bracket">[</span>196<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td><a href="/wiki/GPT-4" title="GPT-4">GPT-4</a></td> <td><span data-sort-value="000000002023-03-01-0000" style="white-space:nowrap">March 2023</span></td> <td>OpenAI</td> <td>Unknown<sup id="cite_ref-204" class="reference"><a href="#cite_note-204"><span class="cite-bracket">[</span>f<span class="cite-bracket">]</span></a></sup> (According to rumors: 1760)<sup id="cite_ref-205" class="reference"><a href="#cite_note-205"><span class="cite-bracket">[</span>199<span class="cite-bracket">]</span></a></sup> </td> <td>Unknown </td> <td>Unknown</td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">proprietary </td> <td>Available for ChatGPT Plus users and used in <a href="/wiki/GPT-4#Usage" title="GPT-4">several products</a>. 
</td></tr> <tr> <td>Chameleon</td> <td><span data-sort-value="000000002024-06-01-0000" style="white-space:nowrap">June 2024</span></td> <td><a href="/wiki/Meta_AI" title="Meta AI">Meta AI</a></td> <td><span data-sort-value="34000000000 !">34</span><sup id="cite_ref-206" class="reference"><a href="#cite_note-206"><span class="cite-bracket">[</span>200<span class="cite-bracket">]</span></a></sup></td> <td><span data-sort-value="4400000000000 !">4.4 trillion</span> tokens</td> <td></td> <td></td> <td> </td></tr> <tr> <td>Cerebras-GPT </td> <td><span data-sort-value="000000002023-03-01-0000" style="white-space:nowrap">March 2023</span> </td> <td><a href="/wiki/Cerebras" title="Cerebras">Cerebras</a> </td> <td><span data-sort-value="13000000000 !">13</span><sup id="cite_ref-D0k2a_207-0" class="reference"><a href="#cite_note-D0k2a-207"><span class="cite-bracket">[</span>201<span class="cite-bracket">]</span></a></sup> </td> <td> </td> <td>270<sup id="cite_ref-:3_171-3" class="reference"><a href="#cite_note-:3-171"><span class="cite-bracket">[</span>168<span class="cite-bracket">]</span></a></sup></td> <td style="background:#9EFF9E;color:black;vertical-align:middle;text-align:center;" class="table-yes">Apache 2.0 </td> <td>Trained with <a href="/wiki/Chinchilla_(language_model)" title="Chinchilla (language model)">Chinchilla formula</a>. 
</td></tr> <tr> <td>Falcon</td> <td><span data-sort-value="000000002023-03-01-0000" style="white-space:nowrap">March 2023</span></td> <td><a href="/wiki/Technology_Innovation_Institute" title="Technology Innovation Institute">Technology Innovation Institute</a></td> <td><span data-sort-value="40000000000 !">40</span><sup id="cite_ref-falcon_208-0" class="reference"><a href="#cite_note-falcon-208"><span class="cite-bracket">[</span>202<span class="cite-bracket">]</span></a></sup></td> <td>1 trillion tokens, from RefinedWeb (filtered web text corpus)<sup id="cite_ref-Xb1gq_209-0" class="reference"><a href="#cite_note-Xb1gq-209"><span class="cite-bracket">[</span>203<span class="cite-bracket">]</span></a></sup> plus some "curated corpora".<sup id="cite_ref-gzTNw_210-0" class="reference"><a href="#cite_note-gzTNw-210"><span class="cite-bracket">[</span>204<span class="cite-bracket">]</span></a></sup> </td> <td>2800<sup id="cite_ref-:5_201-1" class="reference"><a href="#cite_note-:5-201"><span class="cite-bracket">[</span>197<span class="cite-bracket">]</span></a></sup></td> <td style="background:#9EFF9E;color:black;vertical-align:middle;text-align:center;" class="table-yes">Apache 2.0<sup id="cite_ref-Wmlcs_211-0" class="reference"><a href="#cite_note-Wmlcs-211"><span class="cite-bracket">[</span>205<span class="cite-bracket">]</span></a></sup> </td> <td> </td></tr> <tr> <td>BloombergGPT</td> <td><span data-sort-value="000000002023-03-01-0000" style="white-space:nowrap">March 2023</span></td> <td><a href="/wiki/Bloomberg_L.P." 
title="Bloomberg L.P.">Bloomberg L.P.</a></td> <td><span data-sort-value="50000000000 !">50</span></td> <td>363 billion token dataset based on Bloomberg's data sources, plus 345 billion tokens from general purpose datasets<sup id="cite_ref-nGOSu_212-0" class="reference"><a href="#cite_note-nGOSu-212"><span class="cite-bracket">[</span>206<span class="cite-bracket">]</span></a></sup> </td> <td></td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">Proprietary </td> <td>Trained on financial data from proprietary sources, for financial tasks. </td></tr> <tr> <td><a href="/wiki/Huawei_PanGu" title="Huawei PanGu">PanGu-Σ</a></td> <td><span data-sort-value="000000002023-03-01-0000" style="white-space:nowrap">March 2023</span></td> <td><a href="/wiki/Huawei" title="Huawei">Huawei</a></td> <td><span data-sort-value="1085000000000 !">1085</span></td> <td>329 billion tokens<sup id="cite_ref-9WSFw_213-0" class="reference"><a href="#cite_note-9WSFw-213"><span class="cite-bracket">[</span>207<span class="cite-bracket">]</span></a></sup> </td> <td></td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">Proprietary </td> <td> </td></tr> <tr> <td>OpenAssistant<sup id="cite_ref-JiOl8_214-0" class="reference"><a href="#cite_note-JiOl8-214"><span class="cite-bracket">[</span>208<span class="cite-bracket">]</span></a></sup></td> <td><span data-sort-value="000000002023-03-01-0000" style="white-space:nowrap">March 2023</span></td> <td><a href="/wiki/LAION" title="LAION">LAION</a></td> <td><span data-sort-value="17000000000 !">17</span></td> <td>1.5 trillion tokens </td> <td></td> <td style="background:#9EFF9E;color:black;vertical-align:middle;text-align:center;" class="table-yes">Apache 2.0 </td> <td>Trained on crowdsourced open data </td></tr> <tr> <td>Jurassic-2<sup id="cite_ref-215" class="reference"><a href="#cite_note-215"><span class="cite-bracket">[</span>209<span 
class="cite-bracket">]</span></a></sup> </td> <td><span data-sort-value="000000002023-03-01-0000" style="white-space:nowrap">March 2023</span> </td> <td><a href="/wiki/AI21_Labs" title="AI21 Labs">AI21 Labs</a> </td> <td>Unknown </td> <td>Unknown </td> <td></td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">Proprietary </td> <td>Multilingual<sup id="cite_ref-216" class="reference"><a href="#cite_note-216"><span class="cite-bracket">[</span>210<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td><a href="/wiki/PaLM" title="PaLM">PaLM 2</a> (Pathways Language Model 2)</td> <td><span data-sort-value="000000002023-05-01-0000" style="white-space:nowrap">May 2023</span></td> <td>Google</td> <td><span data-sort-value="340000000000 !">340</span><sup id="cite_ref-cnbc-20230516_217-0" class="reference"><a href="#cite_note-cnbc-20230516-217"><span class="cite-bracket">[</span>211<span class="cite-bracket">]</span></a></sup></td> <td><span data-sort-value="3600000000000 !">3.6 trillion</span> tokens<sup id="cite_ref-cnbc-20230516_217-1" class="reference"><a href="#cite_note-cnbc-20230516-217"><span class="cite-bracket">[</span>211<span class="cite-bracket">]</span></a></sup> </td> <td><span data-sort-value="85000 !">85,000</span><sup id="cite_ref-:5_201-2" class="reference"><a href="#cite_note-:5-201"><span class="cite-bracket">[</span>197<span class="cite-bracket">]</span></a></sup></td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">Proprietary </td> <td>Was used in <a href="/wiki/Bard_(chatbot)" class="mw-redirect" title="Bard (chatbot)">Bard chatbot</a>.<sup id="cite_ref-pWyLA_218-0" class="reference"><a href="#cite_note-pWyLA-218"><span class="cite-bracket">[</span>212<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td>Llama 2</td> <td><span data-sort-value="000000002023-07-01-0000" style="white-space:nowrap">July 2023</span></td> 
<td>Meta AI</td> <td><span data-sort-value="70000000000 !">70</span><sup id="cite_ref-meta-20230719_219-0" class="reference"><a href="#cite_note-meta-20230719-219"><span class="cite-bracket">[</span>213<span class="cite-bracket">]</span></a></sup></td> <td><span data-sort-value="2000000000000 !">2 trillion</span> tokens<sup id="cite_ref-meta-20230719_219-1" class="reference"><a href="#cite_note-meta-20230719-219"><span class="cite-bracket">[</span>213<span class="cite-bracket">]</span></a></sup> </td> <td><span data-sort-value="21000 !">21,000</span></td> <td style="background: #FFB; color:black; vertical-align: middle; text-align: center;" class="table-partial">Llama 2 license </td> <td>1.7 million A100-hours.<sup id="cite_ref-220" class="reference"><a href="#cite_note-220"><span class="cite-bracket">[</span>214<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td><a href="/wiki/Claude_(language_model)" title="Claude (language model)">Claude 2</a> </td> <td><span data-sort-value="000000002023-07-01-0000" style="white-space:nowrap">July 2023</span> </td> <td>Anthropic </td> <td>Unknown </td> <td>Unknown </td> <td>Unknown</td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">Proprietary </td> <td>Used in Claude chatbot.<sup id="cite_ref-221" class="reference"><a href="#cite_note-221"><span class="cite-bracket">[</span>215<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td><a href="/wiki/IBM_Granite" title="IBM Granite">Granite 13b</a> </td> <td><span data-sort-value="000000002023-07-01-0000" style="white-space:nowrap">July 2023</span> </td> <td><a href="/wiki/IBM" title="IBM">IBM</a> </td> <td>Unknown </td> <td>Unknown </td> <td>Unknown</td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">Proprietary </td> <td>Used in <a href="/wiki/IBM_Watsonx" title="IBM Watsonx">IBM Watsonx</a>.<sup id="cite_ref-222" class="reference"><a 
href="#cite_note-222"><span class="cite-bracket">[</span>216<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td>Mistral 7B</td> <td><span data-sort-value="000000002023-09-01-0000" style="white-space:nowrap">September 2023</span></td> <td><a href="/wiki/Mistral_AI" title="Mistral AI">Mistral AI</a></td> <td><span data-sort-value="7300000000 !">7.3</span><sup id="cite_ref-mistral-20230927_223-0" class="reference"><a href="#cite_note-mistral-20230927-223"><span class="cite-bracket">[</span>217<span class="cite-bracket">]</span></a></sup></td> <td>Unknown </td> <td></td> <td style="background:#9EFF9E;color:black;vertical-align:middle;text-align:center;" class="table-yes">Apache 2.0 </td> <td> </td></tr> <tr> <td><a href="/wiki/Claude_(language_model)" title="Claude (language model)">Claude 2.1</a> </td> <td><span data-sort-value="000000002023-11-01-0000" style="white-space:nowrap">November 2023</span> </td> <td>Anthropic </td> <td>Unknown </td> <td>Unknown </td> <td>Unknown</td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">Proprietary </td> <td>Used in Claude chatbot. Has a context window of 200,000 tokens, or ~500 pages.<sup id="cite_ref-224" class="reference"><a href="#cite_note-224"><span class="cite-bracket">[</span>218<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td>Grok-1<sup id="cite_ref-225" class="reference"><a href="#cite_note-225"><span class="cite-bracket">[</span>219<span class="cite-bracket">]</span></a></sup> </td> <td><span data-sort-value="000000002023-11-01-0000" style="white-space:nowrap">November 2023</span> </td> <td><a href="/wiki/X.AI" class="mw-redirect" title="X.AI">x.AI</a> </td> <td>314 </td> <td>Unknown </td> <td>Unknown</td> <td style="background:#9EFF9E;color:black;vertical-align:middle;text-align:center;" class="table-yes">Apache 2.0 </td> <td>Used in <a href="/wiki/Grok_(chatbot)" title="Grok (chatbot)">Grok</a> chatbot. 
Grok-1 has a context length of 8,192 tokens and has access to X (Twitter).<sup id="cite_ref-226" class="reference"><a href="#cite_note-226"><span class="cite-bracket">[</span>220<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td><a href="/wiki/Gemini_(language_model)" title="Gemini (language model)">Gemini 1.0</a> </td> <td><span data-sort-value="000000002023-12-01-0000" style="white-space:nowrap">December 2023</span> </td> <td><a href="/wiki/Google_DeepMind" title="Google DeepMind">Google DeepMind</a> </td> <td>Unknown </td> <td>Unknown </td> <td>Unknown</td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">Proprietary </td> <td>Multimodal model, comes in three sizes. Used in <a href="/wiki/Gemini_(chatbot)" title="Gemini (chatbot)">the chatbot of the same name</a>.<sup id="cite_ref-227" class="reference"><a href="#cite_note-227"><span class="cite-bracket">[</span>221<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td>Mixtral 8x7B </td> <td><span data-sort-value="000000002023-12-01-0000" style="white-space:nowrap">December 2023</span> </td> <td><a href="/wiki/Mistral_AI" title="Mistral AI">Mistral AI</a> </td> <td>46.7 </td> <td>Unknown </td> <td>Unknown</td> <td style="background:#9EFF9E;color:black;vertical-align:middle;text-align:center;" class="table-yes">Apache 2.0 </td> <td>Outperforms GPT-3.5 and Llama 2 70B on many benchmarks.<sup id="cite_ref-228" class="reference"><a href="#cite_note-228"><span class="cite-bracket">[</span>222<span class="cite-bracket">]</span></a></sup> <a href="/wiki/Mixture_of_experts" title="Mixture of experts">Mixture of experts</a> model, with 12.9 billion parameters activated per token.<sup id="cite_ref-229" class="reference"><a href="#cite_note-229"><span class="cite-bracket">[</span>223<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td>Mixtral 8x22B </td> <td><span data-sort-value="000000002024-04-01-0000" 
style="white-space:nowrap">April 2024</span> </td> <td><a href="/wiki/Mistral_AI" title="Mistral AI">Mistral AI</a> </td> <td>141 </td> <td>Unknown </td> <td>Unknown</td> <td style="background:#9EFF9E;color:black;vertical-align:middle;text-align:center;" class="table-yes">Apache 2.0 </td> <td><sup id="cite_ref-230" class="reference"><a href="#cite_note-230"><span class="cite-bracket">[</span>224<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td><a href="/w/index.php?title=Phi_(LLM)&action=edit&redlink=1" class="new" title="Phi (LLM) (page does not exist)">Phi-2</a> </td> <td><span data-sort-value="000000002023-12-01-0000" style="white-space:nowrap">December 2023</span> </td> <td>Microsoft </td> <td>2.7 </td> <td>1.4T tokens </td> <td>419<sup id="cite_ref-:9_231-0" class="reference"><a href="#cite_note-:9-231"><span class="cite-bracket">[</span>225<span class="cite-bracket">]</span></a></sup></td> <td style="background:#9EFF9E;color:black;vertical-align:middle;text-align:center;" class="table-yes">MIT </td> <td>Trained on real and synthetic "textbook-quality" data, for 14 days on 96 A100 GPUs.<sup id="cite_ref-:9_231-1" class="reference"><a href="#cite_note-:9-231"><span class="cite-bracket">[</span>225<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td><a href="/wiki/Gemini_(language_model)" title="Gemini (language model)">Gemini 1.5</a> </td> <td><span data-sort-value="000000002024-02-01-0000" style="white-space:nowrap">February 2024</span> </td> <td><a href="/wiki/Google_DeepMind" title="Google DeepMind">Google DeepMind</a> </td> <td>Unknown </td> <td>Unknown </td> <td>Unknown</td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">Proprietary </td> <td>Multimodal model, based on a <a href="/wiki/Mixture_of_experts" title="Mixture of experts">Mixture-of-Experts</a> (MoE) architecture. 
Context window above 1 million tokens.<sup id="cite_ref-232" class="reference"><a href="#cite_note-232"><span class="cite-bracket">[</span>226<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td><a href="/wiki/Gemini_(language_model)" title="Gemini (language model)">Gemini Ultra</a> </td> <td><span data-sort-value="000000002024-02-01-0000" style="white-space:nowrap">February 2024</span> </td> <td><a href="/wiki/Google_DeepMind" title="Google DeepMind">Google DeepMind</a> </td> <td>Unknown </td> <td>Unknown </td> <td>Unknown</td> <td> </td> <td> </td></tr> <tr> <td>Gemma</td> <td><span data-sort-value="000000002024-02-01-0000" style="white-space:nowrap">February 2024</span></td> <td><a href="/wiki/Google_DeepMind" title="Google DeepMind">Google DeepMind</a></td> <td>7</td> <td>6T tokens</td> <td>Unknown</td> <td style="background: #FFB; color:black; vertical-align: middle; text-align: center;" class="table-partial">Gemma Terms of Use<sup id="cite_ref-gemma_233-0" class="reference"><a href="#cite_note-gemma-233"><span class="cite-bracket">[</span>227<span class="cite-bracket">]</span></a></sup></td> <td> </td></tr> <tr> <td><a href="/wiki/Claude_(language_model)" title="Claude (language model)">Claude 3</a> </td> <td>March 2024 </td> <td>Anthropic </td> <td>Unknown </td> <td>Unknown </td> <td>Unknown </td> <td style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">Proprietary </td> <td>Includes three models, Haiku, Sonnet, and Opus.<sup id="cite_ref-234" class="reference"><a href="#cite_note-234"><span class="cite-bracket">[</span>228<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td><a rel="nofollow" class="external text" href="https://rubiks.ai/nova/release/">Nova</a> </td> <td>October 2024 </td> <td><a rel="nofollow" class="external text" href="https://rubiks.ai/">Rubik's AI</a> </td> <td>Unknown </td> <td>Unknown </td> <td>Unknown </td> <td 
style="background:#FFC7C7;color:black;vertical-align:middle;text-align:center;" class="table-no">Proprietary </td> <td>Includes three models, Nova-Instant, Nova-Air, and Nova-Pro. </td></tr> <tr> <td><a href="/wiki/DBRX" title="DBRX">DBRX</a> </td> <td>March 2024 </td> <td><a href="/wiki/Databricks" title="Databricks">Databricks</a> and <a href="/wiki/Mosaic_ML" class="mw-redirect" title="Mosaic ML">Mosaic ML</a> </td> <td><span data-sort-value="136000000000 !">136</span> </td> <td>12T tokens </td> <td> </td> <td style="background:#9EFF9E;color:black;vertical-align:middle;text-align:center;" class="table-yes">Databricks Open Model License </td> <td>Training cost 10 million USD. </td></tr> <tr> <td>Fugaku-LLM </td> <td>May 2024 </td> <td><a href="/wiki/Fujitsu" title="Fujitsu">Fujitsu</a>, <a href="/wiki/Tokyo_Institute_of_Technology" title="Tokyo Institute of Technology">Tokyo Institute of Technology</a>, etc. </td> <td><span data-sort-value="13000000000 !">13</span> </td> <td>380B tokens </td> <td> </td> <td> </td> <td>The largest model ever trained using only CPUs, on the <a href="/wiki/Fugaku_(supercomputer)" title="Fugaku (supercomputer)">Fugaku</a> supercomputer.<sup id="cite_ref-235" class="reference"><a href="#cite_note-235"><span class="cite-bracket">[</span>229<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td><a href="/w/index.php?title=Phi_(LLM)&action=edit&redlink=1" class="new" title="Phi (LLM) (page does not exist)">Phi-3</a> </td> <td><span data-sort-value="000000002024-04-01-0000" style="white-space:nowrap">April 2024</span> </td> <td>Microsoft </td> <td>14<sup id="cite_ref-236" class="reference"><a href="#cite_note-236"><span class="cite-bracket">[</span>230<span class="cite-bracket">]</span></a></sup> </td> <td>4.8T tokens </td> <td> </td> <td style="background:#9EFF9E;color:black;vertical-align:middle;text-align:center;" class="table-yes">MIT </td> <td>Microsoft markets them as "small language models".<sup id="cite_ref-237" class="reference"><a 
href="#cite_note-237"><span class="cite-bracket">[</span>231<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td><a href="/wiki/IBM_Granite" title="IBM Granite">Granite Code Models</a> </td> <td><span data-sort-value="000000002024-05-01-0000" style="white-space:nowrap">May 2024</span> </td> <td><a href="/wiki/IBM" title="IBM">IBM</a> </td> <td>Unknown </td> <td>Unknown </td> <td>Unknown</td> <td style="background:#9EFF9E;color:black;vertical-align:middle;text-align:center;" class="table-yes">Apache 2.0 </td> <td> </td></tr> <tr> <td>Qwen2 </td> <td><span data-sort-value="000000002024-06-01-0000" style="white-space:nowrap">June 2024</span> </td> <td><a href="/wiki/Alibaba_Cloud" title="Alibaba Cloud">Alibaba Cloud</a> </td> <td>72<sup id="cite_ref-238" class="reference"><a href="#cite_note-238"><span class="cite-bracket">[</span>232<span class="cite-bracket">]</span></a></sup> </td> <td>3T tokens </td> <td> </td> <td> </td> <td>Multiple sizes, the smallest being 0.5B. </td></tr> <tr> <td>Nemotron-4 </td> <td>June 2024 </td> <td><a href="/wiki/Nvidia" title="Nvidia">Nvidia</a> </td> <td><span data-sort-value="340000000000 !">340</span> </td> <td>9T tokens </td> <td><span data-sort-value="200000 !">200,000</span> </td> <td style="background:#9EFF9E;color:black;vertical-align:middle;text-align:center;" class="table-yes">NVIDIA Open Model License </td> <td>Trained for 1 epoch. 
Trained on 6144 H100 GPUs between December 2023 and May 2024.<sup id="cite_ref-239" class="reference"><a href="#cite_note-239"><span class="cite-bracket">[</span>233<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-240" class="reference"><a href="#cite_note-240"><span class="cite-bracket">[</span>234<span class="cite-bracket">]</span></a></sup> </td></tr> <tr> <td>Llama 3.1 </td> <td>July 2024 </td> <td>Meta AI </td> <td>405 </td> <td>15.6T tokens </td> <td><span data-sort-value="440000 !">440,000</span> </td> <td style="background: #FFB; color:black; vertical-align: middle; text-align: center;" class="table-partial">Llama 3 license </td> <td>405B version took 31 million hours on <a href="/wiki/Hopper_(microarchitecture)" title="Hopper (microarchitecture)">H100</a>-80GB, at 3.8E25 FLOPs.<sup id="cite_ref-241" class="reference"><a href="#cite_note-241"><span class="cite-bracket">[</span>235<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-242" class="reference"><a href="#cite_note-242"><span class="cite-bracket">[</span>236<span class="cite-bracket">]</span></a></sup> </td></tr></tbody></table> <div class="mw-heading mw-heading2"><h2 id="See_also">See also</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=36" title="Edit section: See also"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <ul><li><a href="/wiki/Foundation_models" class="mw-redirect" title="Foundation models">Foundation models</a></li></ul> <div class="mw-heading mw-heading2"><h2 id="Notes">Notes</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=37" title="Edit section: Notes"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <style data-mw-deduplicate="TemplateStyles:r1239543626">.mw-parser-output 
.reflist{margin-bottom:0.5em;list-style-type:decimal}@media screen{.mw-parser-output .reflist{font-size:90%}}.mw-parser-output .reflist .references{font-size:100%;margin-bottom:0;list-style-type:inherit}.mw-parser-output .reflist-columns-2{column-width:30em}.mw-parser-output .reflist-columns-3{column-width:25em}.mw-parser-output .reflist-columns{margin-top:0.3em}.mw-parser-output .reflist-columns ol{margin-top:0}.mw-parser-output .reflist-columns li{page-break-inside:avoid;break-inside:avoid-column}.mw-parser-output .reflist-upper-alpha{list-style-type:upper-alpha}.mw-parser-output .reflist-upper-roman{list-style-type:upper-roman}.mw-parser-output .reflist-lower-alpha{list-style-type:lower-alpha}.mw-parser-output .reflist-lower-greek{list-style-type:lower-greek}.mw-parser-output .reflist-lower-roman{list-style-type:lower-roman}</style><div class="reflist reflist-lower-alpha"> <div class="mw-references-wrap"><ol class="references"> <li id="cite_note-144"><span class="mw-cite-backlink"><b><a href="#cite_ref-144">^</a></b></span> <span class="reference-text">This is the date that documentation describing the model's architecture was first released.</span> </li> <li id="cite_note-145"><span class="mw-cite-backlink"><b><a href="#cite_ref-145">^</a></b></span> <span class="reference-text">In many cases, researchers release or report on multiple versions of a model having different sizes. In these cases, the size of the largest model is listed here.</span> </li> <li id="cite_note-146"><span class="mw-cite-backlink"><b><a href="#cite_ref-146">^</a></b></span> <span class="reference-text">This is the license of the pre-trained model weights. 
In almost all cases the training code itself is open-source or can be easily replicated.</span> </li> <li id="cite_note-189"><span class="mw-cite-backlink"><b><a href="#cite_ref-189">^</a></b></span> <span class="reference-text">The smaller models, including 66B, are publicly available, while the 175B model is available on request.</span> </li> <li id="cite_note-202"><span class="mw-cite-backlink"><b><a href="#cite_ref-202">^</a></b></span> <span class="reference-text">Facebook's license and distribution scheme restricted access to approved researchers, but the model weights were leaked and became widely available.</span> </li> <li id="cite_note-204"><span class="mw-cite-backlink"><b><a href="#cite_ref-204">^</a></b></span> <span class="reference-text">As stated in the technical report: "Given both the competitive landscape and the safety implications of large-scale models like GPT-4, this report contains no further details about the architecture (including model size), hardware, training compute, dataset construction, training method ..."<sup id="cite_ref-GPT4Tech_203-0" class="reference"><a href="#cite_note-GPT4Tech-203"><span class="cite-bracket">[</span>198<span class="cite-bracket">]</span></a></sup> </span> </li> </ol></div></div> <div class="mw-heading mw-heading2"><h2 id="References">References</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=38" title="Edit section: References"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1239543626"><div class="reflist"> <div class="mw-references-wrap mw-references-columns"><ol class="references"> <li id="cite_note-:7-1"><span class="mw-cite-backlink"><b><a href="#cite_ref-:7_1-0">^</a></b></span> <span class="reference-text"><style data-mw-deduplicate="TemplateStyles:r1238218222">.mw-parser-output 
cite.citation{font-style:inherit;word-wrap:break-word}.mw-parser-output .citation q{quotes:"\"""\"""'""'"}.mw-parser-output .citation:target{background-color:rgba(0,127,255,0.133)}.mw-parser-output .id-lock-free.id-lock-free a{background:url("//upload.wikimedia.org/wikipedia/commons/6/65/Lock-green.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-limited.id-lock-limited a,.mw-parser-output .id-lock-registration.id-lock-registration a{background:url("//upload.wikimedia.org/wikipedia/commons/d/d6/Lock-gray-alt-2.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-subscription.id-lock-subscription a{background:url("//upload.wikimedia.org/wikipedia/commons/a/aa/Lock-red-alt-2.svg")right 0.1em center/9px no-repeat}.mw-parser-output .cs1-ws-icon a{background:url("//upload.wikimedia.org/wikipedia/commons/4/4c/Wikisource-logo.svg")right 0.1em center/12px no-repeat}body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-free a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-limited a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-registration a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-subscription a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .cs1-ws-icon a{background-size:contain;padding:0 1em 0 0}.mw-parser-output .cs1-code{color:inherit;background:inherit;border:none;padding:inherit}.mw-parser-output .cs1-hidden-error{display:none;color:var(--color-error,#d33)}.mw-parser-output .cs1-visible-error{color:var(--color-error,#d33)}.mw-parser-output .cs1-maint{display:none;color:#085;margin-left:0.3em}.mw-parser-output .cs1-kern-left{padding-left:0.2em}.mw-parser-output .cs1-kern-right{padding-right:0.2em}.mw-parser-output .citation .mw-selflink{font-weight:inherit}@media screen{.mw-parser-output .cs1-format{font-size:95%}html.skin-theme-clientpref-night .mw-parser-output .cs1-maint{color:#18911f}}@media screen and 
(prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .cs1-maint{color:#18911f}}</style><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://openai.com/blog/better-language-models/">"Better Language Models and Their Implications"</a>. <i>OpenAI</i>. 2019-02-14. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20201219132206/https://openai.com/blog/better-language-models/">Archived</a> from the original on 2020-12-19<span class="reference-accessdate">. Retrieved <span class="nowrap">2019-08-25</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=OpenAI&rft.atitle=Better+Language+Models+and+Their+Implications&rft.date=2019-02-14&rft_id=https%3A%2F%2Fopenai.com%2Fblog%2Fbetter-language-models%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-few-shot-learners-2"><span class="mw-cite-backlink">^ <a href="#cite_ref-few-shot-learners_2-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-few-shot-learners_2-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-few-shot-learners_2-2"><sup><i><b>c</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBrownMannRyderSubbiah2020" class="citation journal cs1">Brown, Tom B.; Mann, Benjamin; Ryder, Nick; Subbiah, Melanie; Kaplan, Jared; Dhariwal, Prafulla; Neelakantan, Arvind; Shyam, Pranav; Sastry, Girish; Askell, Amanda; Agarwal, Sandhini; Herbert-Voss, Ariel; Krueger, Gretchen; Henighan, Tom; Child, Rewon; Ramesh, Aditya; Ziegler, Daniel M.; Wu, Jeffrey; Winter, Clemens; Hesse, Christopher; Chen, Mark; Sigler, Eric; Litwin, Mateusz; Gray, Scott; Chess, Benjamin; Clark, Jack; Berner, Christopher; McCandlish, Sam; Radford, Alec; Sutskever, Ilya; Amodei, Dario (Dec 2020). 
Larochelle, H.; Ranzato, M.; Hadsell, R.; Balcan, M.F.; Lin, H. (eds.). <a rel="nofollow" class="external text" href="https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf">"Language Models are Few-Shot Learners"</a> <span class="cs1-format">(PDF)</span>. <i>Advances in Neural Information Processing Systems</i>. <b>33</b>. Curran Associates, Inc.: 1877–1901. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20231117204007/https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 2023-11-17<span class="reference-accessdate">. Retrieved <span class="nowrap">2023-03-14</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Advances+in+Neural+Information+Processing+Systems&rft.atitle=Language+Models+are+Few-Shot+Learners&rft.volume=33&rft.pages=1877-1901&rft.date=2020-12&rft.aulast=Brown&rft.aufirst=Tom+B.&rft.au=Mann%2C+Benjamin&rft.au=Ryder%2C+Nick&rft.au=Subbiah%2C+Melanie&rft.au=Kaplan%2C+Jared&rft.au=Dhariwal%2C+Prafulla&rft.au=Neelakantan%2C+Arvind&rft.au=Shyam%2C+Pranav&rft.au=Sastry%2C+Girish&rft.au=Askell%2C+Amanda&rft.au=Agarwal%2C+Sandhini&rft.au=Herbert-Voss%2C+Ariel&rft.au=Krueger%2C+Gretchen&rft.au=Henighan%2C+Tom&rft.au=Child%2C+Rewon&rft.au=Ramesh%2C+Aditya&rft.au=Ziegler%2C+Daniel+M.&rft.au=Wu%2C+Jeffrey&rft.au=Winter%2C+Clemens&rft.au=Hesse%2C+Christopher&rft.au=Chen%2C+Mark&rft.au=Sigler%2C+Eric&rft.au=Litwin%2C+Mateusz&rft.au=Gray%2C+Scott&rft.au=Chess%2C+Benjamin&rft.au=Clark%2C+Jack&rft.au=Berner%2C+Christopher&rft.au=McCandlish%2C+Sam&rft.au=Radford%2C+Alec&rft.au=Sutskever%2C+Ilya&rft.au=Amodei%2C+Dario&rft_id=https%3A%2F%2Fproceedings.neurips.cc%2Fpaper%2F2020%2Ffile%2F1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> 
<li id="cite_note-3"><span class="mw-cite-backlink"><b><a href="#cite_ref-3">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFFathallahDasDe_GiorgisPoltronieri2024" class="citation conference cs1">Fathallah, Nadeen; Das, Arunav; De Giorgis, Stefano; Poltronieri, Andrea; Haase, Peter; Kovriguina, Liubov (2024-05-26). <a rel="nofollow" class="external text" href="https://2024.eswc-conferences.org/wp-content/uploads/2024/05/77770034.pdf"><i>NeOn-GPT: A Large Language Model-Powered Pipeline for Ontology Learning</i></a> <span class="cs1-format">(PDF)</span>. Extended Semantic Web Conference 2024. Hersonissos, Greece.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.btitle=NeOn-GPT%3A+A+Large+Language+Model-Powered+Pipeline+for+Ontology+Learning&rft.place=Hersonissos%2C+Greece&rft.date=2024-05-26&rft.aulast=Fathallah&rft.aufirst=Nadeen&rft.au=Das%2C+Arunav&rft.au=De+Giorgis%2C+Stefano&rft.au=Poltronieri%2C+Andrea&rft.au=Haase%2C+Peter&rft.au=Kovriguina%2C+Liubov&rft_id=https%3A%2F%2F2024.eswc-conferences.org%2Fwp-content%2Fuploads%2F2024%2F05%2F77770034.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-Manning-2022-4"><span class="mw-cite-backlink">^ <a href="#cite_ref-Manning-2022_4-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Manning-2022_4-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFManning2022" class="citation journal cs1"><a href="/wiki/Christopher_D._Manning" title="Christopher D. Manning">Manning, Christopher D.</a> (2022). <a rel="nofollow" class="external text" href="https://www.amacad.org/publication/human-language-understanding-reasoning">"Human Language Understanding & Reasoning"</a>. <i>Daedalus</i>. 
<b>151</b> (2): 127–138. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.1162%2Fdaed_a_01905">10.1162/daed_a_01905</a></span>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:248377870">248377870</a>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20231117205531/https://www.amacad.org/publication/human-language-understanding-reasoning">Archived</a> from the original on 2023-11-17<span class="reference-accessdate">. Retrieved <span class="nowrap">2023-03-09</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Daedalus&rft.atitle=Human+Language+Understanding+%26+Reasoning&rft.volume=151&rft.issue=2&rft.pages=127-138&rft.date=2022&rft_id=info%3Adoi%2F10.1162%2Fdaed_a_01905&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A248377870%23id-name%3DS2CID&rft.aulast=Manning&rft.aufirst=Christopher+D.&rft_id=https%3A%2F%2Fwww.amacad.org%2Fpublication%2Fhuman-language-understanding-reasoning&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-5"><span class="mw-cite-backlink"><b><a href="#cite_ref-5">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFGoodman2001" class="citation cs2">Goodman, Joshua (2001-08-09), <i>A Bit of Progress in Language Modeling</i>, <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/cs/0108005">cs/0108005</a></span>, <a 
href="/wiki/Bibcode_(identifier)" class="mw-redirect" title="Bibcode (identifier)">Bibcode</a>:<a rel="nofollow" class="external text" href="https://ui.adsabs.harvard.edu/abs/2001cs........8005G">2001cs........8005G</a></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=A+Bit+of+Progress+in+Language+Modeling&rft.date=2001-08-09&rft_id=info%3Aarxiv%2Fcs%2F0108005&rft_id=info%3Abibcode%2F2001cs........8005G&rft.aulast=Goodman&rft.aufirst=Joshua&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-6"><span class="mw-cite-backlink"><b><a href="#cite_ref-6">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKilgarriffGrefenstette2003" class="citation journal cs1">Kilgarriff, Adam; Grefenstette, Gregory (September 2003). <a rel="nofollow" class="external text" href="https://direct.mit.edu/coli/article/29/3/333-347/1816">"Introduction to the Special Issue on the Web as Corpus"</a>. <i>Computational Linguistics</i>. <b>29</b> (3): 333–347. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1162%2F089120103322711569">10.1162/089120103322711569</a>. 
<a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/0891-2017">0891-2017</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Computational+Linguistics&rft.atitle=Introduction+to+the+Special+Issue+on+the+Web+as+Corpus&rft.volume=29&rft.issue=3&rft.pages=333-347&rft.date=2003-09&rft_id=info%3Adoi%2F10.1162%2F089120103322711569&rft.issn=0891-2017&rft.aulast=Kilgarriff&rft.aufirst=Adam&rft.au=Grefenstette%2C+Gregory&rft_id=https%3A%2F%2Fdirect.mit.edu%2Fcoli%2Farticle%2F29%2F3%2F333-347%2F1816&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-7"><span class="mw-cite-backlink"><b><a href="#cite_ref-7">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBankoBrill2001" class="citation journal cs1">Banko, Michele; Brill, Eric (2001). <a rel="nofollow" class="external text" href="https://dx.doi.org/10.3115/1073012.1073017">"Scaling to very very large corpora for natural language disambiguation"</a>. <i>Proceedings of the 39th Annual Meeting on Association for Computational Linguistics - ACL '01</i>. Morristown, NJ, USA: Association for Computational Linguistics: 26–33. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.3115%2F1073012.1073017">10.3115/1073012.1073017</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Proceedings+of+the+39th+Annual+Meeting+on+Association+for+Computational+Linguistics+-+ACL+%2701&rft.atitle=Scaling+to+very+very+large+corpora+for+natural+language+disambiguation&rft.pages=26-33&rft.date=2001&rft_id=info%3Adoi%2F10.3115%2F1073012.1073017&rft.aulast=Banko&rft.aufirst=Michele&rft.au=Brill%2C+Eric&rft_id=http%3A%2F%2Fdx.doi.org%2F10.3115%2F1073012.1073017&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-8"><span class="mw-cite-backlink"><b><a href="#cite_ref-8">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFResnikSmith2003" class="citation journal cs1">Resnik, Philip; Smith, Noah A. (September 2003). <a rel="nofollow" class="external text" href="https://direct.mit.edu/coli/article/29/3/349-380/1809">"The Web as a Parallel Corpus"</a>. <i>Computational Linguistics</i>. <b>29</b> (3): 349–380. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.1162%2F089120103322711578">10.1162/089120103322711578</a></span>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/0891-2017">0891-2017</a>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240607172811/https://direct.mit.edu/coli/article/29/3/349-380/1809">Archived</a> from the original on 2024-06-07<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2024-06-07</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Computational+Linguistics&rft.atitle=The+Web+as+a+Parallel+Corpus&rft.volume=29&rft.issue=3&rft.pages=349-380&rft.date=2003-09&rft_id=info%3Adoi%2F10.1162%2F089120103322711578&rft.issn=0891-2017&rft.aulast=Resnik&rft.aufirst=Philip&rft.au=Smith%2C+Noah+A.&rft_id=https%3A%2F%2Fdirect.mit.edu%2Fcoli%2Farticle%2F29%2F3%2F349-380%2F1809&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-9"><span class="mw-cite-backlink"><b><a href="#cite_ref-9">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFHalevyNorvigPereira2009" class="citation journal cs1">Halevy, Alon; Norvig, Peter; Pereira, Fernando (March 2009). <a rel="nofollow" class="external text" href="https://ieeexplore.ieee.org/document/4804817">"The Unreasonable Effectiveness of Data"</a>. <i>IEEE Intelligent Systems</i>. <b>24</b> (2): 8–12. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FMIS.2009.36">10.1109/MIS.2009.36</a>. 
<a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/1541-1672">1541-1672</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=IEEE+Intelligent+Systems&rft.atitle=The+Unreasonable+Effectiveness+of+Data&rft.volume=24&rft.issue=2&rft.pages=8-12&rft.date=2009-03&rft_id=info%3Adoi%2F10.1109%2FMIS.2009.36&rft.issn=1541-1672&rft.aulast=Halevy&rft.aufirst=Alon&rft.au=Norvig%2C+Peter&rft.au=Pereira%2C+Fernando&rft_id=https%3A%2F%2Fieeexplore.ieee.org%2Fdocument%2F4804817&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-10"><span class="mw-cite-backlink"><b><a href="#cite_ref-10">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFChenLiBaiYang2021" class="citation journal cs1">Chen, Leiyu; Li, Shaobo; Bai, Qiang; Yang, Jing; Jiang, Sanlong; Miao, Yanming (2021). <a rel="nofollow" class="external text" href="https://doi.org/10.3390%2Frs13224712">"Review of Image Classification Algorithms Based on Convolutional Neural Networks"</a>. <i>Remote Sensing</i>. <b>13</b> (22): 4712. <a href="/wiki/Bibcode_(identifier)" class="mw-redirect" title="Bibcode (identifier)">Bibcode</a>:<a rel="nofollow" class="external text" href="https://ui.adsabs.harvard.edu/abs/2021RemS...13.4712C">2021RemS...13.4712C</a>. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.3390%2Frs13224712">10.3390/rs13224712</a></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Remote+Sensing&rft.atitle=Review+of+Image+Classification+Algorithms+Based+on+Convolutional+Neural+Networks&rft.volume=13&rft.issue=22&rft.pages=4712&rft.date=2021&rft_id=info%3Adoi%2F10.3390%2Frs13224712&rft_id=info%3Abibcode%2F2021RemS...13.4712C&rft.aulast=Chen&rft.aufirst=Leiyu&rft.au=Li%2C+Shaobo&rft.au=Bai%2C+Qiang&rft.au=Yang%2C+Jing&rft.au=Jiang%2C+Sanlong&rft.au=Miao%2C+Yanming&rft_id=https%3A%2F%2Fdoi.org%2F10.3390%252Frs13224712&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-11"><span class="mw-cite-backlink"><b><a href="#cite_ref-11">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFVaswaniShazeerParmarUszkoreit2017" class="citation journal cs1"><a href="/wiki/Ashish_Vaswani" title="Ashish Vaswani">Vaswani, Ashish</a>; Shazeer, Noam; Parmar, Niki; Uszkoreit, Jakob; Jones, Llion; <a href="/wiki/Aidan_Gomez" title="Aidan Gomez">Gomez, Aidan N</a>; Kaiser, Łukasz; Polosukhin, Illia (2017). <a rel="nofollow" class="external text" href="https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf">"Attention is All you Need"</a> <span class="cs1-format">(PDF)</span>. <i>Advances in Neural Information Processing Systems</i>. <b>30</b>. Curran Associates, Inc. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20240221141113/https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 2024-02-21<span class="reference-accessdate">. Retrieved <span class="nowrap">2024-01-21</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Advances+in+Neural+Information+Processing+Systems&rft.atitle=Attention+is+All+you+Need&rft.volume=30&rft.date=2017&rft.aulast=Vaswani&rft.aufirst=Ashish&rft.au=Shazeer%2C+Noam&rft.au=Parmar%2C+Niki&rft.au=Uszkoreit%2C+Jakob&rft.au=Jones%2C+Llion&rft.au=Gomez%2C+Aidan+N&rft.au=Kaiser%2C+%C5%81ukasz&rft.au=Polosukhin%2C+Illia&rft_id=https%3A%2F%2Fproceedings.neurips.cc%2Fpaper%2F2017%2Ffile%2F3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-12"><span class="mw-cite-backlink"><b><a href="#cite_ref-12">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBahdanauChoBengio2014" class="citation arxiv cs1">Bahdanau, Dzmitry; Cho, Kyunghyun; Bengio, Yoshua (2014). "Neural Machine Translation by Jointly Learning to Align and Translate". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1409.0473">1409.0473</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Neural+Machine+Translation+by+Jointly+Learning+to+Align+and+Translate&rft.date=2014&rft_id=info%3Aarxiv%2F1409.0473&rft.aulast=Bahdanau&rft.aufirst=Dzmitry&rft.au=Cho%2C+Kyunghyun&rft.au=Bengio%2C+Yoshua&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-13"><span class="mw-cite-backlink"><b><a href="#cite_ref-13">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRogersKovalevaRumshisky2020" class="citation journal cs1">Rogers, Anna; Kovaleva, Olga; Rumshisky, Anna (2020). <a rel="nofollow" class="external text" href="https://aclanthology.org/2020.tacl-1.54">"A Primer in BERTology: What We Know About How BERT Works"</a>. <i>Transactions of the Association for Computational Linguistics</i>. <b>8</b>: 842–866. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2002.12327">2002.12327</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1162%2Ftacl_a_00349">10.1162/tacl_a_00349</a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:211532403">211532403</a>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20220403103310/https://aclanthology.org/2020.tacl-1.54/">Archived</a> from the original on 2022-04-03<span class="reference-accessdate">. Retrieved <span class="nowrap">2024-01-21</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Transactions+of+the+Association+for+Computational+Linguistics&rft.atitle=A+Primer+in+BERTology%3A+What+We+Know+About+How+BERT+Works&rft.volume=8&rft.pages=842-866&rft.date=2020&rft_id=info%3Aarxiv%2F2002.12327&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A211532403%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1162%2Ftacl_a_00349&rft.aulast=Rogers&rft.aufirst=Anna&rft.au=Kovaleva%2C+Olga&rft.au=Rumshisky%2C+Anna&rft_id=https%3A%2F%2Faclanthology.org%2F2020.tacl-1.54&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-14"><span class="mw-cite-backlink"><b><a href="#cite_ref-14">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFHern2019" class="citation web cs1">Hern, Alex (14 February 2019). <a rel="nofollow" class="external text" href="https://www.theguardian.com/technology/2019/feb/14/elon-musk-backed-ai-writes-convincing-news-fiction">"New AI fake text generator may be too dangerous to release, say creators"</a>. <i><a href="/wiki/The_Guardian" title="The Guardian">The Guardian</a></i>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20190214173112/https://www.theguardian.com/technology/2019/feb/14/elon-musk-backed-ai-writes-convincing-news-fiction">Archived</a> from the original on 14 February 2019<span class="reference-accessdate">. Retrieved <span class="nowrap">20 January</span> 2024</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=The+Guardian&rft.atitle=New+AI+fake+text+generator+may+be+too+dangerous+to+release%2C+say+creators&rft.date=2019-02-14&rft.aulast=Hern&rft.aufirst=Alex&rft_id=https%3A%2F%2Fwww.theguardian.com%2Ftechnology%2F2019%2Ffeb%2F14%2Felon-musk-backed-ai-writes-convincing-news-fiction&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-15"><span class="mw-cite-backlink"><b><a href="#cite_ref-15">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://www.euronews.com/next/2023/11/30/chatgpt-a-year-on-3-ways-the-ai-chatbot-has-completely-changed-the-world-in-12-months">"ChatGPT a year on: 3 ways the AI chatbot has completely changed the world in 12 months"</a>. <a href="/wiki/Euronews" title="Euronews">Euronews</a>. November 30, 2023. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240114025250/https://www.euronews.com/next/2023/11/30/chatgpt-a-year-on-3-ways-the-ai-chatbot-has-completely-changed-the-world-in-12-months">Archived</a> from the original on January 14, 2024<span class="reference-accessdate">. 
Retrieved <span class="nowrap">January 20,</span> 2024</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=ChatGPT+a+year+on%3A+3+ways+the+AI+chatbot+has+completely+changed+the+world+in+12+months&rft.pub=Euronews&rft.date=2023-11-30&rft_id=https%3A%2F%2Fwww.euronews.com%2Fnext%2F2023%2F11%2F30%2Fchatgpt-a-year-on-3-ways-the-ai-chatbot-has-completely-changed-the-world-in-12-months&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-16"><span class="mw-cite-backlink"><b><a href="#cite_ref-16">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFHeaven2023" class="citation web cs1">Heaven, Will (March 14, 2023). <a rel="nofollow" class="external text" href="https://www.technologyreview.com/2023/03/14/1069823/gpt-4-is-bigger-and-better-chatgpt-openai/">"GPT-4 is bigger and better than ChatGPT—but OpenAI won't say why"</a>. <a href="/wiki/MIT_Technology_Review" title="MIT Technology Review">MIT Technology Review</a>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230317224201/https://www.technologyreview.com/2023/03/14/1069823/gpt-4-is-bigger-and-better-chatgpt-openai/">Archived</a> from the original on March 17, 2023<span class="reference-accessdate">. 
Retrieved <span class="nowrap">January 20,</span> 2024</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=GPT-4+is+bigger+and+better+than+ChatGPT%E2%80%94but+OpenAI+won%27t+say+why&rft.pub=MIT+Technology+Review&rft.date=2023-03-14&rft.aulast=Heaven&rft.aufirst=Will&rft_id=https%3A%2F%2Fwww.technologyreview.com%2F2023%2F03%2F14%2F1069823%2Fgpt-4-is-bigger-and-better-chatgpt-openai%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-17"><span class="mw-cite-backlink"><b><a href="#cite_ref-17">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://ourworldindata.org/grapher/artificial-intelligence-parameter-count?time=2017-09-05..latest">"Parameters in notable artificial intelligence systems"</a>. <i>ourworldindata.org</i>. November 30, 2023<span class="reference-accessdate">. Retrieved <span class="nowrap">January 20,</span> 2024</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=ourworldindata.org&rft.atitle=Parameters+in+notable+artificial+intelligence+systems&rft.date=2023-11-30&rft_id=https%3A%2F%2Fourworldindata.org%2Fgrapher%2Fartificial-intelligence-parameter-count%3Ftime%3D2017-09-05..latest&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-18"><span class="mw-cite-backlink"><b><a href="#cite_ref-18">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard">"LMSYS Chatbot Arena Leaderboard"</a>. <i>huggingface.co</i>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20240610162906/https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard">Archived</a> from the original on June 10, 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">June 12,</span> 2024</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=huggingface.co&rft.atitle=LMSYS+Chatbot+Arena+Leaderboard&rft_id=https%3A%2F%2Fhuggingface.co%2Fspaces%2Flmsys%2Fchatbot-arena-leaderboard&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-19"><span class="mw-cite-backlink"><b><a href="#cite_ref-19">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPengAlcaideAnthonyAlbalak2023" class="citation arxiv cs1">Peng, Bo; et al. (2023). "RWKV: Reinventing RNNs for the Transformer Era". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2305.13048">2305.13048</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=RWKV%3A+Reinventing+RNNs+for+the+Transformer+Era&rft.date=2023&rft_id=info%3Aarxiv%2F2305.13048&rft.aulast=Peng&rft.aufirst=Bo&rft.au=Alcaide%2C+Eric&rft.au=Anthony%2C+Quentin&rft.au=Albalak%2C+Alon&rft.au=Arcadinho%2C+Samuel&rft.au=Biderman%2C+Stella&rft.au=Cao%2C+Huanqi&rft.au=Cheng%2C+Xin&rft.au=Chung%2C+Michael&rft.au=Grella%2C+Matteo&rft.au=Kranthi+Kiran+GV&rft.au=He%2C+Xuzheng&rft.au=Hou%2C+Haowen&rft.au=Lin%2C+Jiaju&rft.au=Kazienko%2C+Przemyslaw&rft.au=Kocon%2C+Jan&rft.au=Kong%2C+Jiaming&rft.au=Koptyra%2C+Bartlomiej&rft.au=Lau%2C+Hayden&rft.au=Krishna+Sri+Ipsit+Mantri&rft.au=Mom%2C+Ferdinand&rft.au=Saito%2C+Atsushi&rft.au=Song%2C+Guangyu&rft.au=Tang%2C+Xiangru&rft.au=Wang%2C+Bolun&rft.au=Wind%2C+Johan+S.&rft.au=Wozniak%2C+Stanislaw&rft.au=Zhang%2C+Ruichong&rft.au=Zhang%2C+Zhenyuan&rft.au=Zhao%2C+Qihang&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-20"><span class="mw-cite-backlink"><b><a href="#cite_ref-20">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFMerritt2022" class="citation web cs1">Merritt, Rick (2022-03-25). <a rel="nofollow" class="external text" href="https://blogs.nvidia.com/blog/2022/03/25/what-is-a-transformer-model/">"What Is a Transformer Model?"</a>. <i>NVIDIA Blog</i>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20231117203924/https://blogs.nvidia.com/blog/what-is-a-transformer-model/">Archived</a> from the original on 2023-11-17<span class="reference-accessdate">. Retrieved <span class="nowrap">2023-07-25</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=NVIDIA+Blog&rft.atitle=What+Is+a+Transformer+Model%3F&rft.date=2022-03-25&rft.aulast=Merritt&rft.aufirst=Rick&rft_id=https%3A%2F%2Fblogs.nvidia.com%2Fblog%2F2022%2F03%2F25%2Fwhat-is-a-transformer-model%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-21"><span class="mw-cite-backlink"><b><a href="#cite_ref-21">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFGuDao2023" class="citation cs2">Gu, Albert; Dao, Tri (2023-12-01), <i>Mamba: Linear-Time Sequence Modeling with Selective State Spaces</i>, <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2312.00752">2312.00752</a></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Mamba%3A+Linear-Time+Sequence+Modeling+with+Selective+State+Spaces&rft.date=2023-12-01&rft_id=info%3Aarxiv%2F2312.00752&rft.aulast=Gu&rft.aufirst=Albert&rft.au=Dao%2C+Tri&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-22"><span class="mw-cite-backlink"><b><a href="#cite_ref-22">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKaushalMahowald2022" class="citation cs2">Kaushal, Ayush; Mahowald, Kyle 
(2022-06-06), <i>What do tokens know about their characters and how do they know it?</i>, <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2206.02608">2206.02608</a></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=What+do+tokens+know+about+their+characters+and+how+do+they+know+it%3F&rft.date=2022-06-06&rft_id=info%3Aarxiv%2F2206.02608&rft.aulast=Kaushal&rft.aufirst=Ayush&rft.au=Mahowald%2C+Kyle&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-23"><span class="mw-cite-backlink"><b><a href="#cite_ref-23">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFYennie_Jun2023" class="citation web cs1">Yennie Jun (2023-05-03). <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230817165705/https://blog.yenniejun.com/p/all-languages-are-not-created-tokenized">"All languages are NOT created (tokenized) equal"</a>. <i>Language models cost much more in some languages than others</i>. Archived from <a rel="nofollow" class="external text" href="https://blog.yenniejun.com/p/all-languages-are-not-created-tokenized">the original</a> on 2023-08-17<span class="reference-accessdate">. Retrieved <span class="nowrap">2023-08-17</span></span>. 
<q>In other words, to express the same sentiment, some languages require up to 10 times more tokens.</q></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Language+models+cost+much+more+in+some+languages+than+others&rft.atitle=All+languages+are+NOT+created+%28tokenized%29+equal&rft.date=2023-05-03&rft.au=Yennie+Jun&rft_id=https%3A%2F%2Fblog.yenniejun.com%2Fp%2Fall-languages-are-not-created-tokenized&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-24"><span class="mw-cite-backlink"><b><a href="#cite_ref-24">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPetrovMalfaTorrBibi2023" class="citation journal cs1">Petrov, Aleksandar; La Malfa, Emanuele; Torr, Philip; Bibi, Adel (June 23, 2023). <a rel="nofollow" class="external text" href="https://openreview.net/forum?id=Pj4YYuxTq9">"Language Model Tokenizers Introduce Unfairness Between Languages"</a>. <i>NeurIPS</i>. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2305.15425">2305.15425</a></span>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20231215212906/https://openreview.net/forum?id=Pj4YYuxTq9">Archived</a> from the original on December 15, 2023<span class="reference-accessdate">. 
Retrieved <span class="nowrap">September 16,</span> 2023</span> – via openreview.net.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=NeurIPS&rft.atitle=Language+Model+Tokenizers+Introduce+Unfairness+Between+Languages&rft.date=2023-06-23&rft_id=info%3Aarxiv%2F2305.15425&rft.aulast=Petrov&rft.aufirst=Aleksandar&rft.au=La+Malfa%2C+Emanuele&rft.au=Torr%2C+Philip&rft.au=Bibi%2C+Adel&rft_id=https%3A%2F%2Fopenreview.net%2Fforum%3Fid%3DPj4YYuxTq9&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-xbiWb-25"><span class="mw-cite-backlink"><b><a href="#cite_ref-xbiWb_25-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://web.archive.org/web/20230423211308/https://platform.openai.com/tokenizer">"OpenAI API"</a>. <i>platform.openai.com</i>. Archived from <a rel="nofollow" class="external text" href="https://platform.openai.com/">the original</a> on April 23, 2023<span class="reference-accessdate">. Retrieved <span class="nowrap">2023-04-30</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=platform.openai.com&rft.atitle=OpenAI+API&rft_id=https%3A%2F%2Fplatform.openai.com%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-2022Book_-26"><span class="mw-cite-backlink">^ <a href="#cite_ref-2022Book_26-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-2022Book_26-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPaaßGiesselbach2022" class="citation book cs1">Paaß, Gerhard; Giesselbach, Sven (2022). 
<a rel="nofollow" class="external text" href="https://link.springer.com/chapter/10.1007/978-3-031-23190-2_2">"Pre-trained Language Models"</a>. <i>Foundation Models for Natural Language Processing</i>. Artificial Intelligence: Foundations, Theory, and Algorithms. pp. 19–78. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1007%2F978-3-031-23190-2_2">10.1007/978-3-031-23190-2_2</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/9783031231902" title="Special:BookSources/9783031231902"><bdi>9783031231902</bdi></a>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230803212329/https://link.springer.com/chapter/10.1007/978-3-031-23190-2_2">Archived</a> from the original on 3 August 2023<span class="reference-accessdate">. Retrieved <span class="nowrap">3 August</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Pre-trained+Language+Models&rft.btitle=Foundation+Models+for+Natural+Language+Processing&rft.series=Artificial+Intelligence%3A+Foundations%2C+Theory%2C+and+Algorithms&rft.pages=19-78&rft.date=2022&rft_id=info%3Adoi%2F10.1007%2F978-3-031-23190-2_2&rft.isbn=9783031231902&rft.aulast=Paa%C3%9F&rft.aufirst=Gerhard&rft.au=Giesselbach%2C+Sven&rft_id=https%3A%2F%2Flink.springer.com%2Fchapter%2F10.1007%2F978-3-031-23190-2_2&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-27"><span class="mw-cite-backlink"><b><a href="#cite_ref-27">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPetrovEmanuele_La_MalfaTorrBibi2023" class="citation arxiv cs1">Petrov, Aleksandar; Emanuele La Malfa; Torr, Philip H. S.; Bibi, Adel (2023). 
"Language Model Tokenizers Introduce Unfairness Between Languages". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2305.15425">2305.15425</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Language+Model+Tokenizers+Introduce+Unfairness+Between+Languages&rft.date=2023&rft_id=info%3Aarxiv%2F2305.15425&rft.aulast=Petrov&rft.aufirst=Aleksandar&rft.au=Emanuele+La+Malfa&rft.au=Torr%2C+Philip+H.+S.&rft.au=Bibi%2C+Adel&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-28"><span class="mw-cite-backlink"><b><a href="#cite_ref-28">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLundberg2023" class="citation web cs1">Lundberg, Scott (2023-12-12). <a rel="nofollow" class="external text" href="https://towardsdatascience.com/the-art-of-prompt-design-prompt-boundaries-and-token-healing-3b2448b0be38">"The Art of Prompt Design: Prompt Boundaries and Token Healing"</a>. <i>Medium</i><span class="reference-accessdate">. 
Retrieved <span class="nowrap">2024-08-05</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Medium&rft.atitle=The+Art+of+Prompt+Design%3A+Prompt+Boundaries+and+Token+Healing&rft.date=2023-12-12&rft.aulast=Lundberg&rft.aufirst=Scott&rft_id=https%3A%2F%2Ftowardsdatascience.com%2Fthe-art-of-prompt-design-prompt-boundaries-and-token-healing-3b2448b0be38&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-aYNg4-29"><span class="mw-cite-backlink"><b><a href="#cite_ref-aYNg4_29-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDodgeSapMarasovićAgnew2021" class="citation arxiv cs1">Dodge, Jesse; Sap, Maarten; Marasović, Ana; Agnew, William; Ilharco, Gabriel; Groeneveld, Dirk; Mitchell, Margaret; Gardner, Matt (2021). "Documenting Large Webtext Corpora: A Case Study on the Colossal Clean Crawled Corpus". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2104.08758">2104.08758</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Documenting+Large+Webtext+Corpora%3A+A+Case+Study+on+the+Colossal+Clean+Crawled+Corpus&rft.date=2021&rft_id=info%3Aarxiv%2F2104.08758&rft.aulast=Dodge&rft.aufirst=Jesse&rft.au=Sap%2C+Maarten&rft.au=Marasovi%C4%87%2C+Ana&rft.au=Agnew%2C+William&rft.au=Ilharco%2C+Gabriel&rft.au=Groeneveld%2C+Dirk&rft.au=Mitchell%2C+Margaret&rft.au=Gardner%2C+Matt&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-30"><span class="mw-cite-backlink"><b><a href="#cite_ref-30">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLeeIppolitoNystromZhang2022" class="citation journal cs1 cs1-prop-long-vol">Lee, Katherine; Ippolito, Daphne; Nystrom, Andrew; Zhang, Chiyuan; Eck, Douglas; Callison-Burch, Chris; Carlini, Nicholas (May 2022). <a rel="nofollow" class="external text" href="https://aclanthology.org/2022.acl-long.577.pdf">"Deduplicating Training Data Makes Language Models Better"</a> <span class="cs1-format">(PDF)</span>. <i>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics</i>. 1: Long Papers: 8424–8445. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.18653%2Fv1%2F2022.acl-long.577">10.18653/v1/2022.acl-long.577</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Proceedings+of+the+60th+Annual+Meeting+of+the+Association+for+Computational+Linguistics&rft.atitle=Deduplicating+Training+Data+Makes+Language+Models+Better&rft.volume=1%3A+Long+Papers&rft.pages=8424-8445&rft.date=2022-05&rft_id=info%3Adoi%2F10.18653%2Fv1%2F2022.acl-long.577&rft.aulast=Lee&rft.aufirst=Katherine&rft.au=Ippolito%2C+Daphne&rft.au=Nystrom%2C+Andrew&rft.au=Zhang%2C+Chiyuan&rft.au=Eck%2C+Douglas&rft.au=Callison-Burch%2C+Chris&rft.au=Carlini%2C+Nicholas&rft_id=https%3A%2F%2Faclanthology.org%2F2022.acl-long.577.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-31"><span class="mw-cite-backlink"><b><a href="#cite_ref-31">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLiBubeckEldanDel_Giorno2023" class="citation cs2">Li, Yuanzhi; Bubeck, Sébastien; Eldan, Ronen; Del Giorno, Allie; Gunasekar, Suriya; Lee, Yin Tat (2023-09-11), <i>Textbooks Are All You Need II: phi-1.5 technical report</i>, <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2309.05463">2309.05463</a></span></cite><span 
title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Textbooks+Are+All+You+Need+II%3A+phi-1.5+technical+report&rft.date=2023-09-11&rft_id=info%3Aarxiv%2F2309.05463&rft.aulast=Li&rft.aufirst=Yuanzhi&rft.au=Bubeck%2C+S%C3%A9bastien&rft.au=Eldan%2C+Ronen&rft.au=Del+Giorno%2C+Allie&rft.au=Gunasekar%2C+Suriya&rft.au=Lee%2C+Yin+Tat&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-32"><span class="mw-cite-backlink"><b><a href="#cite_ref-32">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLinGouGongLiu2024" class="citation arxiv cs1">Lin, Zhenghao; Gou, Zhibin; Gong, Yeyun; Liu, Xiao; Shen, Yelong; Xu, Ruochen; Lin, Chen; Yang, Yujiu; Jiao, Jian (2024-04-11). "Rho-1: Not All Tokens Are What You Need". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2404.07965">2404.07965</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Rho-1%3A+Not+All+Tokens+Are+What+You+Need&rft.date=2024-04-11&rft_id=info%3Aarxiv%2F2404.07965&rft.aulast=Lin&rft.aufirst=Zhenghao&rft.au=Gou%2C+Zhibin&rft.au=Gong%2C+Yeyun&rft.au=Liu%2C+Xiao&rft.au=Shen%2C+Yelong&rft.au=Xu%2C+Ruochen&rft.au=Lin%2C+Chen&rft.au=Yang%2C+Yujiu&rft.au=Jiao%2C+Jian&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-qbFw1-33"><span class="mw-cite-backlink"><b><a href="#cite_ref-qbFw1_33-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" 
href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBrownMannRyderSubbiah2020" class="citation arxiv cs1">Brown, Tom B.; et al. (2020). "Language Models are Few-Shot Learners". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2005.14165">2005.14165</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Language+Models+are+Few-Shot+Learners&rft.date=2020&rft_id=info%3Aarxiv%2F2005.14165&rft.aulast=Brown&rft.aufirst=Tom+B.&rft.au=Mann%2C+Benjamin&rft.au=Ryder%2C+Nick&rft.au=Subbiah%2C+Melanie&rft.au=Kaplan%2C+Jared&rft.au=Dhariwal%2C+Prafulla&rft.au=Neelakantan%2C+Arvind&rft.au=Shyam%2C+Pranav&rft.au=Sastry%2C+Girish&rft.au=Askell%2C+Amanda&rft.au=Agarwal%2C+Sandhini&rft.au=Herbert-Voss%2C+Ariel&rft.au=Krueger%2C+Gretchen&rft.au=Henighan%2C+Tom&rft.au=Child%2C+Rewon&rft.au=Ramesh%2C+Aditya&rft.au=Ziegler%2C+Daniel+M.&rft.au=Wu%2C+Jeffrey&rft.au=Winter%2C+Clemens&rft.au=Hesse%2C+Christopher&rft.au=Chen%2C+Mark&rft.au=Sigler%2C+Eric&rft.au=Litwin%2C+Mateusz&rft.au=Gray%2C+Scott&rft.au=Chess%2C+Benjamin&rft.au=Clark%2C+Jack&rft.au=Berner%2C+Christopher&rft.au=McCandlish%2C+Sam&rft.au=Radford%2C+Alec&rft.au=Sutskever%2C+Ilya&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-34"><span class="mw-cite-backlink"><b><a href="#cite_ref-34">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFAbdinJacobsAwanAneja2024" class="citation arxiv cs1">Abdin, Marah; Jacobs, Sam Ade; Awan, Ammar Ahmad; Aneja, Jyoti; Awadallah, Ahmed; Awadalla, Hany; Bach, Nguyen; Bahree, Amit; 
Bakhtiari, Arash (2024-04-23). "Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2404.14219">2404.14219</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Phi-3+Technical+Report%3A+A+Highly+Capable+Language+Model+Locally+on+Your+Phone&rft.date=2024-04-23&rft_id=info%3Aarxiv%2F2404.14219&rft.aulast=Abdin&rft.aufirst=Marah&rft.au=Jacobs%2C+Sam+Ade&rft.au=Awan%2C+Ammar+Ahmad&rft.au=Aneja%2C+Jyoti&rft.au=Awadallah%2C+Ahmed&rft.au=Awadalla%2C+Hany&rft.au=Bach%2C+Nguyen&rft.au=Bahree%2C+Amit&rft.au=Bakhtiari%2C+Arash&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-instructGPT-paper-35"><span class="mw-cite-backlink"><b><a href="#cite_ref-instructGPT-paper_35-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFOuyangWuJiangAlmeida2022" class="citation arxiv cs1">Ouyang, Long; Wu, Jeff; Jiang, Xu; Almeida, Diogo; Wainwright, Carroll L.; Mishkin, Pamela; Zhang, Chong; Agarwal, Sandhini; Slama, Katarina; Ray, Alex; Schulman, John; Hilton, Jacob; Kelton, Fraser; Miller, Luke; Simens, Maddie; Askell, Amanda; Welinder, Peter; Christiano, Paul; Leike, Jan; Lowe, Ryan (2022). "Training language models to follow instructions with human feedback". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2203.02155">2203.02155</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Training+language+models+to+follow+instructions+with+human+feedback&rft.date=2022&rft_id=info%3Aarxiv%2F2203.02155&rft.aulast=Ouyang&rft.aufirst=Long&rft.au=Wu%2C+Jeff&rft.au=Jiang%2C+Xu&rft.au=Almeida%2C+Diogo&rft.au=Wainwright%2C+Carroll+L.&rft.au=Mishkin%2C+Pamela&rft.au=Zhang%2C+Chong&rft.au=Agarwal%2C+Sandhini&rft.au=Slama%2C+Katarina&rft.au=Ray%2C+Alex&rft.au=Schulman%2C+John&rft.au=Hilton%2C+Jacob&rft.au=Kelton%2C+Fraser&rft.au=Miller%2C+Luke&rft.au=Simens%2C+Maddie&rft.au=Askell%2C+Amanda&rft.au=Welinder%2C+Peter&rft.au=Christiano%2C+Paul&rft.au=Leike%2C+Jan&rft.au=Lowe%2C+Ryan&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-self-instruct-paper-36"><span class="mw-cite-backlink"><b><a href="#cite_ref-self-instruct-paper_36-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWangKordiMishraLiu2022" class="citation arxiv cs1">Wang, Yizhong; Kordi, Yeganeh; Mishra, Swaroop; Liu, Alisa; Smith, Noah A.; Khashabi, Daniel; Hajishirzi, Hannaneh (2022). "Self-Instruct: Aligning Language Model with Self Generated Instructions". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2212.10560">2212.10560</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Self-Instruct%3A+Aligning+Language+Model+with+Self+Generated+Instructions&rft.date=2022&rft_id=info%3Aarxiv%2F2212.10560&rft.aulast=Wang&rft.aufirst=Yizhong&rft.au=Kordi%2C+Yeganeh&rft.au=Mishra%2C+Swaroop&rft.au=Liu%2C+Alisa&rft.au=Smith%2C+Noah+A.&rft.au=Khashabi%2C+Daniel&rft.au=Hajishirzi%2C+Hannaneh&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-HGZCJ-37"><span class="mw-cite-backlink"><b><a href="#cite_ref-HGZCJ_37-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFShazeerMirhoseiniMaziarzDavis2017" class="citation arxiv cs1">Shazeer, Noam; Mirhoseini, Azalia; Maziarz, Krzysztof; Davis, Andy; Le, Quoc; Hinton, Geoffrey; Dean, Jeff (2017-01-01). "Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1701.06538">1701.06538</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Outrageously+Large+Neural+Networks%3A+The+Sparsely-Gated+Mixture-of-Experts+Layer&rft.date=2017-01-01&rft_id=info%3Aarxiv%2F1701.06538&rft.aulast=Shazeer&rft.aufirst=Noam&rft.au=Mirhoseini%2C+Azalia&rft.au=Maziarz%2C+Krzysztof&rft.au=Davis%2C+Andy&rft.au=Le%2C+Quoc&rft.au=Hinton%2C+Geoffrey&rft.au=Dean%2C+Jeff&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-R9Qq5-38"><span class="mw-cite-backlink"><b><a href="#cite_ref-R9Qq5_38-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLepikhinLeeXuChen2021" class="citation arxiv cs1">Lepikhin, Dmitry; Lee, HyoukJoong; Xu, Yuanzhong; Chen, Dehao; Firat, Orhan; Huang, Yanping; Krikun, Maxim; Shazeer, Noam; Chen, Zhifeng (2021-01-12). "GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2006.16668">2006.16668</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=GShard%3A+Scaling+Giant+Models+with+Conditional+Computation+and+Automatic+Sharding&rft.date=2021-01-12&rft_id=info%3Aarxiv%2F2006.16668&rft.aulast=Lepikhin&rft.aufirst=Dmitry&rft.au=Lee%2C+HyoukJoong&rft.au=Xu%2C+Yuanzhong&rft.au=Chen%2C+Dehao&rft.au=Firat%2C+Orhan&rft.au=Huang%2C+Yanping&rft.au=Krikun%2C+Maxim&rft.au=Shazeer%2C+Noam&rft.au=Chen%2C+Zhifeng&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-glam-blog-39"><span class="mw-cite-backlink">^ <a href="#cite_ref-glam-blog_39-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-glam-blog_39-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-glam-blog_39-2"><sup><i><b>c</b></i></sup></a> <a href="#cite_ref-glam-blog_39-3"><sup><i><b>d</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDaiDu2021" class="citation web cs1">Dai, Andrew M; Du, Nan (December 9, 2021). <a rel="nofollow" class="external text" href="https://ai.googleblog.com/2021/12/more-efficient-in-context-learning-with.html">"More Efficient In-Context Learning with GLaM"</a>. <i>ai.googleblog.com</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230312072042/https://ai.googleblog.com/2021/12/more-efficient-in-context-learning-with.html">Archived</a> from the original on 2023-03-12<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-03-09</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=ai.googleblog.com&rft.atitle=More+Efficient+In-Context+Learning+with+GLaM&rft.date=2021-12-09&rft.aulast=Dai&rft.aufirst=Andrew+M&rft.au=Du%2C+Nan&rft_id=https%3A%2F%2Fai.googleblog.com%2F2021%2F12%2Fmore-efficient-in-context-learning-with.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-emergentpaper-40"><span class="mw-cite-backlink">^ <a href="#cite_ref-emergentpaper_40-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-emergentpaper_40-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-emergentpaper_40-2"><sup><i><b>c</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWeiTayBommasaniRaffel2022" class="citation journal cs1">Wei, Jason; Tay, Yi; Bommasani, Rishi; Raffel, Colin; Zoph, Barret; Borgeaud, Sebastian; Yogatama, Dani; Bosma, Maarten; Zhou, Denny; Metzler, Donald; Chi, Ed H.; Hashimoto, Tatsunori; Vinyals, Oriol; Liang, Percy; Dean, Jeff; Fedus, William (31 August 2022). <a rel="nofollow" class="external text" href="https://openreview.net/forum?id=yzkSU5zdwD">"Emergent Abilities of Large Language Models"</a>. <i>Transactions on Machine Learning Research</i>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/2835-8856">2835-8856</a>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230322210052/https://openreview.net/forum?id=yzkSU5zdwD">Archived</a> from the original on 22 March 2023<span class="reference-accessdate">. 
Retrieved <span class="nowrap">19 March</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Transactions+on+Machine+Learning+Research&rft.atitle=Emergent+Abilities+of+Large+Language+Models&rft.date=2022-08-31&rft.issn=2835-8856&rft.aulast=Wei&rft.aufirst=Jason&rft.au=Tay%2C+Yi&rft.au=Bommasani%2C+Rishi&rft.au=Raffel%2C+Colin&rft.au=Zoph%2C+Barret&rft.au=Borgeaud%2C+Sebastian&rft.au=Yogatama%2C+Dani&rft.au=Bosma%2C+Maarten&rft.au=Zhou%2C+Denny&rft.au=Metzler%2C+Donald&rft.au=Chi%2C+Ed+H.&rft.au=Hashimoto%2C+Tatsunori&rft.au=Vinyals%2C+Oriol&rft.au=Liang%2C+Percy&rft.au=Dean%2C+Jeff&rft.au=Fedus%2C+William&rft_id=https%3A%2F%2Fopenreview.net%2Fforum%3Fid%3DyzkSU5zdwD&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-Jay_Allamar-41"><span class="mw-cite-backlink"><b><a href="#cite_ref-Jay_Allamar_41-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFAllamar" class="citation web cs1">Alammar, Jay. <a rel="nofollow" class="external text" href="https://jalammar.github.io/illustrated-transformer/">"Illustrated transformer"</a>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230725230033/http://jalammar.github.io/illustrated-transformer/">Archived</a> from the original on 2023-07-25<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-07-29</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Illustrated+transformer&rft.aulast=Alammar&rft.aufirst=Jay&rft_id=https%3A%2F%2Fjalammar.github.io%2Fillustrated-transformer%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-Jay_Allamar_GPT2-42"><span class="mw-cite-backlink"><b><a href="#cite_ref-Jay_Allamar_GPT2_42-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFAllamar" class="citation web cs1">Alammar, Jay. <a rel="nofollow" class="external text" href="https://jalammar.github.io/illustrated-gpt2/">"The Illustrated GPT-2 (Visualizing Transformer Language Models)"</a><span class="reference-accessdate">. Retrieved <span class="nowrap">2023-08-01</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=The+Illustrated+GPT-2+%28Visualizing+Transformer+Language+Models%29&rft.aulast=Alammar&rft.aufirst=Jay&rft_id=https%3A%2F%2Fjalammar.github.io%2Fillustrated-gpt2%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-43"><span class="mw-cite-backlink"><b><a href="#cite_ref-43">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://blog.google/technology/ai/google-gemini-next-generation-model-february-2024/#context-window">"Our next-generation model: Gemini 1.5"</a>. <i>Google</i>. 15 February 2024. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20240218141522/https://blog.google/technology/ai/google-gemini-next-generation-model-february-2024/#context-window">Archived</a> from the original on 18 February 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">18 February</span> 2024</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Google&rft.atitle=Our+next-generation+model%3A+Gemini+1.5&rft.date=2024-02-15&rft_id=https%3A%2F%2Fblog.google%2Ftechnology%2Fai%2Fgoogle-gemini-next-generation-model-february-2024%2F%23context-window&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-44"><span class="mw-cite-backlink"><b><a href="#cite_ref-44">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://www.anthropic.com/news/claude-2-1-prompting">"Long context prompting for Claude 2.1"</a>. December 6, 2023. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240827053830/https://www.anthropic.com/news/claude-2-1-prompting">Archived</a> from the original on August 27, 2024<span class="reference-accessdate">. 
Retrieved <span class="nowrap">January 20,</span> 2024</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Long+context+prompting+for+Claude+2.1&rft.date=2023-12-06&rft_id=https%3A%2F%2Fwww.anthropic.com%2Fnews%2Fclaude-2-1-prompting&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-45"><span class="mw-cite-backlink"><b><a href="#cite_ref-45">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://platform.openai.com/docs/guides/rate-limits">"Rate limits"</a>. <i>openai.com</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240202003219/https://platform.openai.com/docs/guides/rate-limits">Archived</a> from the original on February 2, 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">January 20,</span> 2024</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=openai.com&rft.atitle=Rate+limits&rft_id=https%3A%2F%2Fplatform.openai.com%2Fdocs%2Fguides%2Frate-limits&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-ioUpE-46"><span class="mw-cite-backlink"><b><a href="#cite_ref-ioUpE_46-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFZaibShengEmma_Zhang2020" class="citation book cs1">Zaib, Munazza; Sheng, Quan Z.; Zhang, Wei Emma (4 February 2020). <a rel="nofollow" class="external text" href="https://www.researchgate.net/publication/338931711">"A Short Survey of Pre-trained Language Models for Conversational AI-A New Age in NLP"</a>. 
<i>Proceedings of the Australasian Computer Science Week Multiconference</i>. pp. 1–4. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2104.10810">2104.10810</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1145%2F3373017.3373028">10.1145/3373017.3373028</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/9781450376976" title="Special:BookSources/9781450376976"><bdi>9781450376976</bdi></a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:211040895">211040895</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=A+Short+Survey+of+Pre-trained+Language+Models+for+Conversational+AI-A+New+Age+in+NLP&rft.btitle=Proceedings+of+the+Australasian+Computer+Science+Week+Multiconference&rft.pages=1-4&rft.date=2020-02-04&rft_id=info%3Aarxiv%2F2104.10810&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A211040895%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1145%2F3373017.3373028&rft.isbn=9781450376976&rft.aulast=Zaib&rft.aufirst=Munazza&rft.au=Sheng%2C+Quan+Z.&rft.au=Zhang%2C+Wei+Emma&rft_id=https%3A%2F%2Fwww.researchgate.net%2Fpublication%2F338931711&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-jm-47"><span class="mw-cite-backlink">^ <a href="#cite_ref-jm_47-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-jm_47-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-jm_47-2"><sup><i><b>c</b></i></sup></a></span> <span class="reference-text"><link 
rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFJurafskyMartin2023" class="citation book cs1">Jurafsky, Dan; Martin, James H. (7 January 2023). <a rel="nofollow" class="external text" href="https://web.stanford.edu/~jurafsky/slp3/ed3book_jan72023.pdf"><i>Speech and Language Processing</i></a> <span class="cs1-format">(PDF)</span> (3rd draft ed.). <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230323210221/https://web.stanford.edu/~jurafsky/slp3/ed3book_jan72023.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 23 March 2023<span class="reference-accessdate">. Retrieved <span class="nowrap">24 May</span> 2022</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Speech+and+Language+Processing&rft.edition=3rd+draft&rft.date=2023-01-07&rft.aulast=Jurafsky&rft.aufirst=Dan&rft.au=Martin%2C+James+H.&rft_id=https%3A%2F%2Fweb.stanford.edu%2F~jurafsky%2Fslp3%2Fed3book_jan72023.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-48"><span class="mw-cite-backlink"><b><a href="#cite_ref-48">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://imbue.com/research/70b-infrastructure/">"From bare metal to a 70B model: infrastructure set-up and scripts"</a>. <i>imbue.com</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240726203419/https://imbue.com/research/70b-infrastructure/">Archived</a> from the original on 2024-07-26<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2024-07-24</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=imbue.com&rft.atitle=From+bare+metal+to+a+70B+model%3A+infrastructure+set-up+and+scripts&rft_id=https%3A%2F%2Fimbue.com%2Fresearch%2F70b-infrastructure%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-49"><span class="mw-cite-backlink"><b><a href="#cite_ref-49">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://github.com/facebookresearch/metaseq/tree/main/projects/OPT/chronicles">"metaseq/projects/OPT/chronicles at main · facebookresearch/metaseq"</a>. <i>GitHub</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240124035658/https://github.com/facebookresearch/metaseq/tree/main/projects/OPT/chronicles">Archived</a> from the original on 2024-01-24<span class="reference-accessdate">. Retrieved <span class="nowrap">2024-07-24</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=GitHub&rft.atitle=metaseq%2Fprojects%2FOPT%2Fchronicles+at+main+%C2%B7+facebookresearch%2Fmetaseq&rft_id=https%3A%2F%2Fgithub.com%2Ffacebookresearch%2Fmetaseq%2Ftree%2Fmain%2Fprojects%2FOPT%2Fchronicles&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-50"><span class="mw-cite-backlink"><b><a href="#cite_ref-50">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFAlbrecht2024" class="citation web cs1">Albrecht, Josh (2024-07-23). 
<a rel="nofollow" class="external text" href="https://www.latent.space/p/llm-training-2024">"State of the Art: Training >70B LLMs on 10,000 H100 clusters"</a>. <i>www.latent.space</i><span class="reference-accessdate">. Retrieved <span class="nowrap">2024-07-24</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=www.latent.space&rft.atitle=State+of+the+Art%3A+Training+%3E70B+LLMs+on+10%2C000+H100+clusters&rft.date=2024-07-23&rft.aulast=Albrecht&rft.aufirst=Josh&rft_id=https%3A%2F%2Fwww.latent.space%2Fp%2Fllm-training-2024&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-Wiggers-51"><span class="mw-cite-backlink">^ <a href="#cite_ref-Wiggers_51-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Wiggers_51-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWiggers2022" class="citation web cs1">Wiggers, Kyle (28 April 2022). <a rel="nofollow" class="external text" href="https://techcrunch.com/2022/04/28/the-emerging-types-of-language-models-and-why-they-matter/">"The emerging types of language models and why they matter"</a>. <i>TechCrunch</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230316072443/https://techcrunch.com/2022/04/28/the-emerging-types-of-language-models-and-why-they-matter/">Archived</a> from the original on 16 March 2023<span class="reference-accessdate">. 
Retrieved <span class="nowrap">9 March</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=TechCrunch&rft.atitle=The+emerging+types+of+language+models+and+why+they+matter&rft.date=2022-04-28&rft.aulast=Wiggers&rft.aufirst=Kyle&rft_id=https%3A%2F%2Ftechcrunch.com%2F2022%2F04%2F28%2Fthe-emerging-types-of-language-models-and-why-they-matter%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-xaytj-52"><span class="mw-cite-backlink"><b><a href="#cite_ref-xaytj_52-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSharirPelegShoham2020" class="citation arxiv cs1">Sharir, Or; Peleg, Barak; Shoham, Yoav (2020). "The Cost of Training NLP Models: A Concise Overview". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2004.08900">2004.08900</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=The+Cost+of+Training+NLP+Models%3A+A+Concise+Overview&rft.date=2020&rft_id=info%3Aarxiv%2F2004.08900&rft.aulast=Sharir&rft.aufirst=Or&rft.au=Peleg%2C+Barak&rft.au=Shoham%2C+Yoav&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-Pythia-53"><span class="mw-cite-backlink"><b><a href="#cite_ref-Pythia_53-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBidermanSchoelkopfAnthonyBradley2023" class="citation arxiv cs1">Biderman, Stella; 
Schoelkopf, Hailey; Anthony, Quentin; Bradley, Herbie; Khan, Mohammad Aflah; Purohit, Shivanshu; Prashanth, USVSN Sai (April 2023). "Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2304.01373">2304.01373</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Pythia%3A+A+Suite+for+Analyzing+Large+Language+Models+Across+Training+and+Scaling&rft.date=2023-04&rft_id=info%3Aarxiv%2F2304.01373&rft.aulast=Biderman&rft.aufirst=Stella&rft.au=Schoelkopf%2C+Hailey&rft.au=Anthony%2C+Quentin&rft.au=Bradley%2C+Herbie&rft.au=Khan%2C+Mohammad+Aflah&rft.au=Purohit%2C+Shivanshu&rft.au=Prashanth%2C+USVSN+Sai&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-54"><span class="mw-cite-backlink"><b><a href="#cite_ref-54">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFMaslejFattoriniBrynjolfssonEtchemendy2023" class="citation cs2">Maslej, Nestor; Fattorini, Loredana; Brynjolfsson, Erik; Etchemendy, John; Ligett, Katrina; Lyons, Terah; Manyika, James; Ngo, Helen; Niebles, Juan Carlos (2023-10-05), <i>Artificial Intelligence Index Report 2023</i>, <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2310.03715">2310.03715</a></span></cite><span 
title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Artificial+Intelligence+Index+Report+2023&rft.date=2023-10-05&rft_id=info%3Aarxiv%2F2310.03715&rft.aulast=Maslej&rft.aufirst=Nestor&rft.au=Fattorini%2C+Loredana&rft.au=Brynjolfsson%2C+Erik&rft.au=Etchemendy%2C+John&rft.au=Ligett%2C+Katrina&rft.au=Lyons%2C+Terah&rft.au=Manyika%2C+James&rft.au=Ngo%2C+Helen&rft.au=Niebles%2C+Juan+Carlos&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-kaplan-scaling-55"><span class="mw-cite-backlink">^ <a href="#cite_ref-kaplan-scaling_55-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-kaplan-scaling_55-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text">Section 2.1 and Table 1, <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKaplanMcCandlishHenighanBrown2020" class="citation arxiv cs1">Kaplan, Jared; McCandlish, Sam; Henighan, Tom; Brown, Tom B.; Chess, Benjamin; Child, Rewon; Gray, Scott; Radford, Alec; Wu, Jeffrey; Amodei, Dario (2020). "Scaling Laws for Neural Language Models". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2001.08361">2001.08361</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Scaling+Laws+for+Neural+Language+Models&rft.date=2020&rft_id=info%3Aarxiv%2F2001.08361&rft.aulast=Kaplan&rft.aufirst=Jared&rft.au=McCandlish%2C+Sam&rft.au=Henighan%2C+Tom&rft.au=Brown%2C+Tom+B.&rft.au=Chess%2C+Benjamin&rft.au=Child%2C+Rewon&rft.au=Gray%2C+Scott&rft.au=Radford%2C+Alec&rft.au=Wu%2C+Jeffrey&rft.au=Amodei%2C+Dario&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-PI1fW-56"><span class="mw-cite-backlink"><b><a href="#cite_ref-PI1fW_56-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFGaoMadaanZhouAlon2022" class="citation arxiv cs1">Gao, Luyu; Madaan, Aman; Zhou, Shuyan; Alon, Uri; Liu, Pengfei; Yang, Yiming; Callan, Jamie; Neubig, Graham (2022-11-01). "PAL: Program-aided Language Models". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2211.10435">2211.10435</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=PAL%3A+Program-aided+Language+Models&rft.date=2022-11-01&rft_id=info%3Aarxiv%2F2211.10435&rft.aulast=Gao&rft.aufirst=Luyu&rft.au=Madaan%2C+Aman&rft.au=Zhou%2C+Shuyan&rft.au=Alon%2C+Uri&rft.au=Liu%2C+Pengfei&rft.au=Yang%2C+Yiming&rft.au=Callan%2C+Jamie&rft.au=Neubig%2C+Graham&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-J5OW5-57"><span class="mw-cite-backlink"><b><a href="#cite_ref-J5OW5_57-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://reasonwithpal.com/">"PAL: Program-aided Language Models"</a>. <i>reasonwithpal.com</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230612162208/https://reasonwithpal.com/">Archived</a> from the original on 2023-06-12<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-06-12</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=reasonwithpal.com&rft.atitle=PAL%3A+Program-aided+Language+Models&rft_id=https%3A%2F%2Freasonwithpal.com%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-gQxzq-58"><span class="mw-cite-backlink"><b><a href="#cite_ref-gQxzq_58-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFParanjapeLundbergSinghHajishirzi2023" class="citation arxiv cs1">Paranjape, Bhargavi; Lundberg, Scott; Singh, Sameer; Hajishirzi, Hannaneh; Zettlemoyer, Luke; Ribeiro, Marco Tulio (2023-03-01). "ART: Automatic multi-step reasoning and tool-use for large language models". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2303.09014">2303.09014</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=ART%3A+Automatic+multi-step+reasoning+and+tool-use+for+large+language+models&rft.date=2023-03-01&rft_id=info%3Aarxiv%2F2303.09014&rft.aulast=Paranjape&rft.aufirst=Bhargavi&rft.au=Lundberg%2C+Scott&rft.au=Singh%2C+Sameer&rft.au=Hajishirzi%2C+Hannaneh&rft.au=Zettlemoyer%2C+Luke&rft.au=Ribeiro%2C+Marco+Tulio&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-lLrda-59"><span class="mw-cite-backlink"><b><a href="#cite_ref-lLrda_59-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite 
id="CITEREFLiangWuSongWu2023" class="citation arxiv cs1">Liang, Yaobo; Wu, Chenfei; Song, Ting; Wu, Wenshan; Xia, Yan; Liu, Yu; Ou, Yang; Lu, Shuai; Ji, Lei; Mao, Shaoguang; Wang, Yun; Shou, Linjun; Gong, Ming; Duan, Nan (2023-03-01). "TaskMatrix.AI: Completing Tasks by Connecting Foundation Models with Millions of APIs". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2303.16434">2303.16434</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.AI">cs.AI</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=TaskMatrix.AI%3A+Completing+Tasks+by+Connecting+Foundation+Models+with+Millions+of+APIs&rft.date=2023-03-01&rft_id=info%3Aarxiv%2F2303.16434&rft.aulast=Liang&rft.aufirst=Yaobo&rft.au=Wu%2C+Chenfei&rft.au=Song%2C+Ting&rft.au=Wu%2C+Wenshan&rft.au=Xia%2C+Yan&rft.au=Liu%2C+Yu&rft.au=Ou%2C+Yang&rft.au=Lu%2C+Shuai&rft.au=Ji%2C+Lei&rft.au=Mao%2C+Shaoguang&rft.au=Wang%2C+Yun&rft.au=Shou%2C+Linjun&rft.au=Gong%2C+Ming&rft.au=Duan%2C+Nan&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-4Xzrs-60"><span class="mw-cite-backlink"><b><a href="#cite_ref-4Xzrs_60-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPatilZhangWangGonzalez2023" class="citation arxiv cs1">Patil, Shishir G.; Zhang, Tianjun; Wang, Xin; Gonzalez, Joseph E. (2023-05-01). "Gorilla: Large Language Model Connected with Massive APIs". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2305.15334">2305.15334</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Gorilla%3A+Large+Language+Model+Connected+with+Massive+APIs&rft.date=2023-05-01&rft_id=info%3Aarxiv%2F2305.15334&rft.aulast=Patil&rft.aufirst=Shishir+G.&rft.au=Zhang%2C+Tianjun&rft.au=Wang%2C+Xin&rft.au=Gonzalez%2C+Joseph+E.&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-BUZBP-61"><span class="mw-cite-backlink"><b><a href="#cite_ref-BUZBP_61-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLewisPerezPiktusPetroni2020" class="citation journal cs1">Lewis, Patrick; Perez, Ethan; Piktus, Aleksandra; Petroni, Fabio; Karpukhin, Vladimir; Goyal, Naman; Küttler, Heinrich; Lewis, Mike; Yih, Wen-tau; Rocktäschel, Tim; Riedel, Sebastian; Kiela, Douwe (2020). <a rel="nofollow" class="external text" href="https://proceedings.neurips.cc/paper/2020/hash/6b493230205f780e1bc26945df7481e5-Abstract.html">"Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks"</a>. <i>Advances in Neural Information Processing Systems</i>. <b>33</b>. Curran Associates, Inc.: 9459–9474. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2005.11401">2005.11401</a></span>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20230612171229/https://proceedings.neurips.cc/paper/2020/hash/6b493230205f780e1bc26945df7481e5-Abstract.html">Archived</a> from the original on 2023-06-12<span class="reference-accessdate">. Retrieved <span class="nowrap">2023-06-12</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Advances+in+Neural+Information+Processing+Systems&rft.atitle=Retrieval-Augmented+Generation+for+Knowledge-Intensive+NLP+Tasks&rft.volume=33&rft.pages=9459-9474&rft.date=2020&rft_id=info%3Aarxiv%2F2005.11401&rft.aulast=Lewis&rft.aufirst=Patrick&rft.au=Perez%2C+Ethan&rft.au=Piktus%2C+Aleksandra&rft.au=Petroni%2C+Fabio&rft.au=Karpukhin%2C+Vladimir&rft.au=Goyal%2C+Naman&rft.au=K%C3%BCttler%2C+Heinrich&rft.au=Lewis%2C+Mike&rft.au=Yih%2C+Wen-tau&rft.au=Rockt%C3%A4schel%2C+Tim&rft.au=Riedel%2C+Sebastian&rft.au=Kiela%2C+Douwe&rft_id=https%3A%2F%2Fproceedings.neurips.cc%2Fpaper%2F2020%2Fhash%2F6b493230205f780e1bc26945df7481e5-Abstract.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-62"><span class="mw-cite-backlink"><b><a href="#cite_ref-62">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://www.kdnuggets.com/the-growth-behind-llmbased-autonomous-agents">"The Growth Behind LLM-based Autonomous Agents"</a>. <i>KDnuggets</i>. 
2023-10-23.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=KDnuggets&rft.atitle=The+Growth+Behind+LLM-based+Autonomous+Agents&rft.date=2023-10-23&rft_id=https%3A%2F%2Fwww.kdnuggets.com%2Fthe-growth-behind-llmbased-autonomous-agents&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-DmvNE-63"><span class="mw-cite-backlink"><b><a href="#cite_ref-DmvNE_63-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFYaoZhaoYuDu2022" class="citation arxiv cs1">Yao, Shunyu; Zhao, Jeffrey; Yu, Dian; Du, Nan; Shafran, Izhak; Narasimhan, Karthik; Cao, Yuan (2022-10-01). "ReAct: Synergizing Reasoning and Acting in Language Models". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2210.03629">2210.03629</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=ReAct%3A+Synergizing+Reasoning+and+Acting+in+Language+Models&rft.date=2022-10-01&rft_id=info%3Aarxiv%2F2210.03629&rft.aulast=Yao&rft.aufirst=Shunyu&rft.au=Zhao%2C+Jeffrey&rft.au=Yu%2C+Dian&rft.au=Du%2C+Nan&rft.au=Shafran%2C+Izhak&rft.au=Narasimhan%2C+Karthik&rft.au=Cao%2C+Yuan&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-JS8Vd-64"><span class="mw-cite-backlink"><b><a href="#cite_ref-JS8Vd_64-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWuPrabhumoyeMin2023" class="citation arxiv cs1">Wu, 
Yue; Prabhumoye, Shrimai; Min, So Yeon (2023-05-24). "SPRING: GPT-4 Out-performs RL Algorithms by Studying Papers and Reasoning". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2305.15486">2305.15486</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.AI">cs.AI</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=SPRING%3A+GPT-4+Out-performs+RL+Algorithms+by+Studying+Papers+and+Reasoning&rft.date=2023-05-24&rft_id=info%3Aarxiv%2F2305.15486&rft.aulast=Wu&rft.aufirst=Yue&rft.au=Prabhumoye%2C+Shrimai&rft.au=Min%2C+So+Yeon&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-65"><span class="mw-cite-backlink"><b><a href="#cite_ref-65">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWangCaiLiuMa2023" class="citation arxiv cs1">Wang, Zihao; Cai, Shaofei; Liu, Anji; Ma, Xiaojian; Liang, Yitao (2023-02-03). "Describe, Explain, Plan and Select: Interactive Planning with Large Language Models Enables Open-World Multi-Task Agents". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2302.01560">2302.01560</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.AI">cs.AI</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Describe%2C+Explain%2C+Plan+and+Select%3A+Interactive+Planning+with+Large+Language+Models+Enables+Open-World+Multi-Task+Agents&rft.date=2023-02-03&rft_id=info%3Aarxiv%2F2302.01560&rft.aulast=Wang&rft.aufirst=Zihao&rft.au=Cai%2C+Shaofei&rft.au=Liu%2C+Anji&rft.au=Ma%2C+Xiaojian&rft.au=Liang%2C+Yitao&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-sbB2T-66"><span class="mw-cite-backlink"><b><a href="#cite_ref-sbB2T_66-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFShinnCassanoLabashGopinath2023" class="citation arxiv cs1">Shinn, Noah; Cassano, Federico; Labash, Beck; Gopinath, Ashwin; Narasimhan, Karthik; Yao, Shunyu (2023-03-01). "Reflexion: Language Agents with Verbal Reinforcement Learning". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2303.11366">2303.11366</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.AI">cs.AI</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Reflexion%3A+Language+Agents+with+Verbal+Reinforcement+Learning&rft.date=2023-03-01&rft_id=info%3Aarxiv%2F2303.11366&rft.aulast=Shinn&rft.aufirst=Noah&rft.au=Cassano%2C+Federico&rft.au=Labash%2C+Beck&rft.au=Gopinath%2C+Ashwin&rft.au=Narasimhan%2C+Karthik&rft.au=Yao%2C+Shunyu&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-ltTer-67"><span class="mw-cite-backlink"><b><a href="#cite_ref-ltTer_67-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFHaoGuMaJiahua_Hong2023" class="citation arxiv cs1">Hao, Shibo; Gu, Yi; Ma, Haodi; Jiahua Hong, Joshua; Wang, Zhen; Zhe Wang, Daisy; Hu, Zhiting (2023-05-01). "Reasoning with Language Model is Planning with World Model". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2305.14992">2305.14992</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Reasoning+with+Language+Model+is+Planning+with+World+Model&rft.date=2023-05-01&rft_id=info%3Aarxiv%2F2305.14992&rft.aulast=Hao&rft.aufirst=Shibo&rft.au=Gu%2C+Yi&rft.au=Ma%2C+Haodi&rft.au=Jiahua+Hong%2C+Joshua&rft.au=Wang%2C+Zhen&rft.au=Zhe+Wang%2C+Daisy&rft.au=Hu%2C+Zhiting&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-mBvD9-68"><span class="mw-cite-backlink"><b><a href="#cite_ref-mBvD9_68-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFZhangLehmanStanleyClune2023" class="citation arxiv cs1">Zhang, Jenny; Lehman, Joel; Stanley, Kenneth; Clune, Jeff (2023-06-02). "OMNI: Open-endedness via Models of human Notions of Interestingness". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2306.01711">2306.01711</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.AI">cs.AI</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=OMNI%3A+Open-endedness+via+Models+of+human+Notions+of+Interestingness&rft.date=2023-06-02&rft_id=info%3Aarxiv%2F2306.01711&rft.aulast=Zhang&rft.aufirst=Jenny&rft.au=Lehman%2C+Joel&rft.au=Stanley%2C+Kenneth&rft.au=Clune%2C+Jeff&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-:0-69"><span class="mw-cite-backlink">^ <a href="#cite_ref-:0_69-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:0_69-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://voyager.minedojo.org/">"Voyager | An Open-Ended Embodied Agent with Large Language Models"</a>. <i>voyager.minedojo.org</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230608225054/https://voyager.minedojo.org/">Archived</a> from the original on 2023-06-08<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-06-09</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=voyager.minedojo.org&rft.atitle=Voyager+%7C+An+Open-Ended+Embodied+Agent+with+Large+Language+Models&rft_id=https%3A%2F%2Fvoyager.minedojo.org%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-XuvjF-70"><span class="mw-cite-backlink"><b><a href="#cite_ref-XuvjF_70-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFParkO'BrienCaiRingel_Morris2023" class="citation arxiv cs1">Park, Joon Sung; O'Brien, Joseph C.; Cai, Carrie J.; Ringel Morris, Meredith; Liang, Percy; Bernstein, Michael S. (2023-04-01). "Generative Agents: Interactive Simulacra of Human Behavior". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2304.03442">2304.03442</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.HC">cs.HC</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Generative+Agents%3A+Interactive+Simulacra+of+Human+Behavior&rft.date=2023-04-01&rft_id=info%3Aarxiv%2F2304.03442&rft.aulast=Park&rft.aufirst=Joon+Sung&rft.au=O%27Brien%2C+Joseph+C.&rft.au=Cai%2C+Carrie+J.&rft.au=Ringel+Morris%2C+Meredith&rft.au=Liang%2C+Percy&rft.au=Bernstein%2C+Michael+S.&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-71"><span class="mw-cite-backlink"><b><a href="#cite_ref-71">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite 
id="CITEREFMann" class="citation web cs1">Mann, Tobias. <a rel="nofollow" class="external text" href="https://www.theregister.com/2024/03/17/ai_pc_local_llm/">"How to run an LLM locally on your PC in less than 10 minutes"</a>. <i>www.theregister.com</i><span class="reference-accessdate">. Retrieved <span class="nowrap">2024-05-17</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=www.theregister.com&rft.atitle=How+to+run+an+LLM+locally+on+your+PC+in+less+than+10+minutes&rft.aulast=Mann&rft.aufirst=Tobias&rft_id=https%3A%2F%2Fwww.theregister.com%2F2024%2F03%2F17%2Fai_pc_local_llm%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-LS2Go-72"><span class="mw-cite-backlink"><b><a href="#cite_ref-LS2Go_72-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFNagelAmjadBaalenLouizos2020" class="citation journal cs1">Nagel, Markus; Amjad, Rana Ali; Baalen, Mart Van; Louizos, Christos; Blankevoort, Tijmen (2020-11-21). <a rel="nofollow" class="external text" href="https://proceedings.mlr.press/v119/nagel20a.html">"Up or Down? Adaptive Rounding for Post-Training Quantization"</a>. <i>Proceedings of the 37th International Conference on Machine Learning</i>. PMLR: 7197–7206. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230614080854/https://proceedings.mlr.press/v119/nagel20a.html">Archived</a> from the original on 2023-06-14<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-06-14</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Proceedings+of+the+37th+International+Conference+on+Machine+Learning&rft.atitle=Up+or+Down%3F+Adaptive+Rounding+for+Post-Training+Quantization&rft.pages=7197-7206&rft.date=2020-11-21&rft.aulast=Nagel&rft.aufirst=Markus&rft.au=Amjad%2C+Rana+Ali&rft.au=Baalen%2C+Mart+Van&rft.au=Louizos%2C+Christos&rft.au=Blankevoort%2C+Tijmen&rft_id=https%3A%2F%2Fproceedings.mlr.press%2Fv119%2Fnagel20a.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-cpzcK-73"><span class="mw-cite-backlink"><b><a href="#cite_ref-cpzcK_73-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPolinoPascanuAlistarh2018" class="citation arxiv cs1">Polino, Antonio; Pascanu, Razvan; Alistarh, Dan (2018-02-01). "Model compression via distillation and quantization". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1802.05668">1802.05668</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.NE">cs.NE</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Model+compression+via+distillation+and+quantization&rft.date=2018-02-01&rft_id=info%3Aarxiv%2F1802.05668&rft.aulast=Polino&rft.aufirst=Antonio&rft.au=Pascanu%2C+Razvan&rft.au=Alistarh%2C+Dan&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-QVU95-74"><span class="mw-cite-backlink"><b><a href="#cite_ref-QVU95_74-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFFrantarAshkboosHoeflerAlistarh2022" class="citation arxiv cs1">Frantar, Elias; Ashkboos, Saleh; Hoefler, Torsten; Alistarh, Dan (2022-10-01). "GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2210.17323">2210.17323</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=GPTQ%3A+Accurate+Post-Training+Quantization+for+Generative+Pre-trained+Transformers&rft.date=2022-10-01&rft_id=info%3Aarxiv%2F2210.17323&rft.aulast=Frantar&rft.aufirst=Elias&rft.au=Ashkboos%2C+Saleh&rft.au=Hoefler%2C+Torsten&rft.au=Alistarh%2C+Dan&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-dU9Bu-75"><span class="mw-cite-backlink"><b><a href="#cite_ref-dU9Bu_75-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDettmersSvirschevskiEgiazarianKuznedelev2023" class="citation arxiv cs1">Dettmers, Tim; Svirschevski, Ruslan; Egiazarian, Vage; Kuznedelev, Denis; Frantar, Elias; Ashkboos, Saleh; Borzunov, Alexander; Hoefler, Torsten; Alistarh, Dan (2023-06-01). "SpQR: A Sparse-Quantized Representation for Near-Lossless LLM Weight Compression". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2306.03078">2306.03078</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=SpQR%3A+A+Sparse-Quantized+Representation+for+Near-Lossless+LLM+Weight+Compression&rft.date=2023-06-01&rft_id=info%3Aarxiv%2F2306.03078&rft.aulast=Dettmers&rft.aufirst=Tim&rft.au=Svirschevski%2C+Ruslan&rft.au=Egiazarian%2C+Vage&rft.au=Kuznedelev%2C+Denis&rft.au=Frantar%2C+Elias&rft.au=Ashkboos%2C+Saleh&rft.au=Borzunov%2C+Alexander&rft.au=Hoefler%2C+Torsten&rft.au=Alistarh%2C+Dan&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-76"><span class="mw-cite-backlink"><b><a href="#cite_ref-76">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFGrootendorst" class="citation web cs1">Grootendorst, Maarten. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240731003355/https://newsletter.maartengrootendorst.com/p/a-visual-guide-to-quantization">"A Visual Guide to Quantization"</a>. <i>newsletter.maartengrootendorst.com</i>. Archived from <a rel="nofollow" class="external text" href="https://newsletter.maartengrootendorst.com/p/a-visual-guide-to-quantization">the original</a> on 2024-07-31<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2024-07-31</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=newsletter.maartengrootendorst.com&rft.atitle=A+Visual+Guide+to+Quantization&rft.aulast=Grootendorst&rft.aufirst=Maarten&rft_id=https%3A%2F%2Fnewsletter.maartengrootendorst.com%2Fp%2Fa-visual-guide-to-quantization&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-D0nFA-77"><span class="mw-cite-backlink"><b><a href="#cite_ref-D0nFA_77-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDettmersPagnoniHoltzmanZettlemoyer2023" class="citation arxiv cs1">Dettmers, Tim; Pagnoni, Artidoro; <a href="/wiki/Ari_Holtzman" title="Ari Holtzman">Holtzman, Ari</a>; Zettlemoyer, Luke (2023-05-01). "QLoRA: Efficient Finetuning of Quantized LLMs". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2305.14314">2305.14314</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=QLoRA%3A+Efficient+Finetuning+of+Quantized+LLMs&rft.date=2023-05-01&rft_id=info%3Aarxiv%2F2305.14314&rft.aulast=Dettmers&rft.aufirst=Tim&rft.au=Pagnoni%2C+Artidoro&rft.au=Holtzman%2C+Ari&rft.au=Zettlemoyer%2C+Luke&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-78"><span class="mw-cite-backlink"><b><a href="#cite_ref-78">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite 
id="CITEREFKirosSalakhutdinovZemel2014" class="citation journal cs1">Kiros, Ryan; Salakhutdinov, Ruslan; Zemel, Rich (2014-06-18). <a rel="nofollow" class="external text" href="https://proceedings.mlr.press/v32/kiros14.html">"Multimodal Neural Language Models"</a>. <i>Proceedings of the 31st International Conference on Machine Learning</i>. PMLR: 595–603. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230702195952/https://proceedings.mlr.press/v32/kiros14.html">Archived</a> from the original on 2023-07-02<span class="reference-accessdate">. Retrieved <span class="nowrap">2023-07-02</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Proceedings+of+the+31st+International+Conference+on+Machine+Learning&rft.atitle=Multimodal+Neural+Language+Models&rft.pages=595-603&rft.date=2014-06-18&rft.aulast=Kiros&rft.aufirst=Ryan&rft.au=Salakhutdinov%2C+Ruslan&rft.au=Zemel%2C+Rich&rft_id=https%3A%2F%2Fproceedings.mlr.press%2Fv32%2Fkiros14.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-79"><span class="mw-cite-backlink"><b><a href="#cite_ref-79">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKrizhevskySutskeverHinton2012" class="citation journal cs1">Krizhevsky, Alex; Sutskever, Ilya; Hinton, Geoffrey E (2012). <a rel="nofollow" class="external text" href="https://proceedings.neurips.cc/paper/2012/hash/c399862d3b9d6b76c8436e924a68c45b-Abstract.html">"ImageNet Classification with Deep Convolutional Neural Networks"</a>. <i>Advances in Neural Information Processing Systems</i>. <b>25</b>. Curran Associates, Inc. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20230702195952/https://proceedings.neurips.cc/paper/2012/hash/c399862d3b9d6b76c8436e924a68c45b-Abstract.html">Archived</a> from the original on 2023-07-02<span class="reference-accessdate">. Retrieved <span class="nowrap">2023-07-02</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Advances+in+Neural+Information+Processing+Systems&rft.atitle=ImageNet+Classification+with+Deep+Convolutional+Neural+Networks&rft.volume=25&rft.date=2012&rft.aulast=Krizhevsky&rft.aufirst=Alex&rft.au=Sutskever%2C+Ilya&rft.au=Hinton%2C+Geoffrey+E&rft_id=https%3A%2F%2Fproceedings.neurips.cc%2Fpaper%2F2012%2Fhash%2Fc399862d3b9d6b76c8436e924a68c45b-Abstract.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-80"><span class="mw-cite-backlink"><b><a href="#cite_ref-80">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFAntolAgrawalLuMitchell2015" class="citation journal cs1">Antol, Stanislaw; Agrawal, Aishwarya; Lu, Jiasen; Mitchell, Margaret; Batra, Dhruv; Zitnick, C. Lawrence; Parikh, Devi (2015). <a rel="nofollow" class="external text" href="https://openaccess.thecvf.com/content_iccv_2015/html/Antol_VQA_Visual_Question_ICCV_2015_paper.html">"VQA: Visual Question Answering"</a>. <i>ICCV</i>: 2425–2433. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230702195952/https://openaccess.thecvf.com/content_iccv_2015/html/Antol_VQA_Visual_Question_ICCV_2015_paper.html">Archived</a> from the original on 2023-07-02<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-07-02</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=ICCV&rft.atitle=VQA%3A+Visual+Question+Answering&rft.pages=2425-2433&rft.date=2015&rft.aulast=Antol&rft.aufirst=Stanislaw&rft.au=Agrawal%2C+Aishwarya&rft.au=Lu%2C+Jiasen&rft.au=Mitchell%2C+Margaret&rft.au=Batra%2C+Dhruv&rft.au=Zitnick%2C+C.+Lawrence&rft.au=Parikh%2C+Devi&rft_id=https%3A%2F%2Fopenaccess.thecvf.com%2Fcontent_iccv_2015%2Fhtml%2FAntol_VQA_Visual_Question_ICCV_2015_paper.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-81"><span class="mw-cite-backlink"><b><a href="#cite_ref-81">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLiLiSavareseHoi2023" class="citation arxiv cs1">Li, Junnan; Li, Dongxu; Savarese, Silvio; Hoi, Steven (2023-01-01). "BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2301.12597">2301.12597</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CV">cs.CV</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=BLIP-2%3A+Bootstrapping+Language-Image+Pre-training+with+Frozen+Image+Encoders+and+Large+Language+Models&rft.date=2023-01-01&rft_id=info%3Aarxiv%2F2301.12597&rft.aulast=Li&rft.aufirst=Junnan&rft.au=Li%2C+Dongxu&rft.au=Savarese%2C+Silvio&rft.au=Hoi%2C+Steven&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-82"><span class="mw-cite-backlink"><b><a href="#cite_ref-82">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFAlayracDonahueLucMiech2022" class="citation journal cs1">Alayrac, Jean-Baptiste; Donahue, Jeff; Luc, Pauline; Miech, Antoine; Barr, Iain; Hasson, Yana; Lenc, Karel; Mensch, Arthur; Millican, Katherine; Reynolds, Malcolm; Ring, Roman; Rutherford, Eliza; Cabi, Serkan; Han, Tengda; Gong, Zhitao (2022-12-06). <a rel="nofollow" class="external text" href="https://proceedings.neurips.cc/paper_files/paper/2022/hash/960a172bc7fbf0177ccccbb411a7d800-Abstract-Conference.html">"Flamingo: a Visual Language Model for Few-Shot Learning"</a>. <i>Advances in Neural Information Processing Systems</i>. <b>35</b>: 23716–23736. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2204.14198">2204.14198</a></span>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20230702195951/https://proceedings.neurips.cc/paper_files/paper/2022/hash/960a172bc7fbf0177ccccbb411a7d800-Abstract-Conference.html">Archived</a> from the original on 2023-07-02<span class="reference-accessdate">. Retrieved <span class="nowrap">2023-07-02</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Advances+in+Neural+Information+Processing+Systems&rft.atitle=Flamingo%3A+a+Visual+Language+Model+for+Few-Shot+Learning&rft.volume=35&rft.pages=23716-23736&rft.date=2022-12-06&rft_id=info%3Aarxiv%2F2204.14198&rft.aulast=Alayrac&rft.aufirst=Jean-Baptiste&rft.au=Donahue%2C+Jeff&rft.au=Luc%2C+Pauline&rft.au=Miech%2C+Antoine&rft.au=Barr%2C+Iain&rft.au=Hasson%2C+Yana&rft.au=Lenc%2C+Karel&rft.au=Mensch%2C+Arthur&rft.au=Millican%2C+Katherine&rft.au=Reynolds%2C+Malcolm&rft.au=Ring%2C+Roman&rft.au=Rutherford%2C+Eliza&rft.au=Cabi%2C+Serkan&rft.au=Han%2C+Tengda&rft.au=Gong%2C+Zhitao&rft_id=https%3A%2F%2Fproceedings.neurips.cc%2Fpaper_files%2Fpaper%2F2022%2Fhash%2F960a172bc7fbf0177ccccbb411a7d800-Abstract-Conference.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-83"><span class="mw-cite-backlink"><b><a href="#cite_ref-83">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDriessXiaSajjadiLynch2023" class="citation arxiv cs1">Driess, Danny; Xia, Fei; Sajjadi, Mehdi S. M.; Lynch, Corey; Chowdhery, Aakanksha; Ichter, Brian; Wahid, Ayzaan; Tompson, Jonathan; Vuong, Quan; Yu, Tianhe; Huang, Wenlong; Chebotar, Yevgen; Sermanet, Pierre; Duckworth, Daniel; Levine, Sergey (2023-03-01). "PaLM-E: An Embodied Multimodal Language Model". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2303.03378">2303.03378</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=PaLM-E%3A+An+Embodied+Multimodal+Language+Model&rft.date=2023-03-01&rft_id=info%3Aarxiv%2F2303.03378&rft.aulast=Driess&rft.aufirst=Danny&rft.au=Xia%2C+Fei&rft.au=Sajjadi%2C+Mehdi+S.+M.&rft.au=Lynch%2C+Corey&rft.au=Chowdhery%2C+Aakanksha&rft.au=Ichter%2C+Brian&rft.au=Wahid%2C+Ayzaan&rft.au=Tompson%2C+Jonathan&rft.au=Vuong%2C+Quan&rft.au=Yu%2C+Tianhe&rft.au=Huang%2C+Wenlong&rft.au=Chebotar%2C+Yevgen&rft.au=Sermanet%2C+Pierre&rft.au=Duckworth%2C+Daniel&rft.au=Levine%2C+Sergey&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-84"><span class="mw-cite-backlink"><b><a href="#cite_ref-84">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLiuLiWuLee2023" class="citation arxiv cs1">Liu, Haotian; Li, Chunyuan; Wu, Qingyang; Lee, Yong Jae (2023-04-01). "Visual Instruction Tuning". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2304.08485">2304.08485</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CV">cs.CV</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Visual+Instruction+Tuning&rft.date=2023-04-01&rft_id=info%3Aarxiv%2F2304.08485&rft.aulast=Liu&rft.aufirst=Haotian&rft.au=Li%2C+Chunyuan&rft.au=Wu%2C+Qingyang&rft.au=Lee%2C+Yong+Jae&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-85"><span class="mw-cite-backlink"><b><a href="#cite_ref-85">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFZhangLiBing2023" class="citation arxiv cs1">Zhang, Hang; Li, Xin; Bing, Lidong (2023-06-01). "Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2306.02858">2306.02858</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Video-LLaMA%3A+An+Instruction-tuned+Audio-Visual+Language+Model+for+Video+Understanding&rft.date=2023-06-01&rft_id=info%3Aarxiv%2F2306.02858&rft.aulast=Zhang&rft.aufirst=Hang&rft.au=Li%2C+Xin&rft.au=Bing%2C+Lidong&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-86"><span class="mw-cite-backlink"><b><a href="#cite_ref-86">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFOpenAI2023" class="citation arxiv cs1">OpenAI (2023-03-27). "GPT-4 Technical Report". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2303.08774">2303.08774</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=GPT-4+Technical+Report&rft.date=2023-03-27&rft_id=info%3Aarxiv%2F2303.08774&rft.au=OpenAI&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-87"><span class="mw-cite-backlink"><b><a href="#cite_ref-87">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFOpenAI2023" class="citation web cs1">OpenAI (September 25, 2023). <a rel="nofollow" class="external text" href="https://cdn.openai.com/papers/GPTV_System_Card.pdf">"GPT-4V(ision) System Card"</a> <span class="cs1-format">(PDF)</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=GPT-4V%28ision%29+System+Card&rft.date=2023-09-25&rft.au=OpenAI&rft_id=https%3A%2F%2Fcdn.openai.com%2Fpapers%2FGPTV_System_Card.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-88"><span class="mw-cite-backlink"><b><a href="#cite_ref-88">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPichai2023" class="citation cs2">Pichai, Sundar (10 May 2023), <a rel="nofollow" class="external text" href="https://www.youtube.com/watch?v=cNfINi5CNbY&t=931s"><i>Google Keynote (Google I/O '23)</i></a>, timestamp 15:31<span class="reference-accessdate">, retrieved <span 
class="nowrap">2023-07-02</span></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Google+Keynote+%28Google+I%2FO+%2723%29&rft.pages=timestamp+15%3A31&rft.date=2023-05-10&rft.aulast=Pichai&rft.aufirst=Sundar&rft_id=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcNfINi5CNbY%26t%3D931s&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-89"><span class="mw-cite-backlink"><b><a href="#cite_ref-89">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWiggers2024" class="citation web cs1">Wiggers, Kyle (11 September 2024). <a rel="nofollow" class="external text" href="https://techcrunch.com/2024/09/11/mistral-releases-pixtral-its-first-multimodal-model/">"Mistral releases Pixtral 12B, its first multimodal model"</a>. <i>TechCrunch</i><span class="reference-accessdate">. 
Retrieved <span class="nowrap">14 September</span> 2024</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=TechCrunch&rft.atitle=Mistral+releases+Pixtral+12B%2C+its+first+multimodal+model&rft.date=2024-09-11&rft.aulast=Wiggers&rft.aufirst=Kyle&rft_id=https%3A%2F%2Ftechcrunch.com%2F2024%2F09%2F11%2Fmistral-releases-pixtral-its-first-multimodal-model%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-fJta3-90"><span class="mw-cite-backlink"><b><a href="#cite_ref-fJta3_90-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFHoffmannBorgeaudMenschBuchatskaya2022" class="citation arxiv cs1">Hoffmann, Jordan; Borgeaud, Sebastian; Mensch, Arthur; Buchatskaya, Elena; Cai, Trevor; Rutherford, Eliza; Casas, Diego de Las; Hendricks, Lisa Anne; Welbl, Johannes; Clark, Aidan; Hennigan, Tom; Noland, Eric; Millican, Katie; Driessche, George van den; Damoc, Bogdan (2022-03-29). "Training Compute-Optimal Large Language Models". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2203.15556">2203.15556</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Training+Compute-Optimal+Large+Language+Models&rft.date=2022-03-29&rft_id=info%3Aarxiv%2F2203.15556&rft.aulast=Hoffmann&rft.aufirst=Jordan&rft.au=Borgeaud%2C+Sebastian&rft.au=Mensch%2C+Arthur&rft.au=Buchatskaya%2C+Elena&rft.au=Cai%2C+Trevor&rft.au=Rutherford%2C+Eliza&rft.au=Casas%2C+Diego+de+Las&rft.au=Hendricks%2C+Lisa+Anne&rft.au=Welbl%2C+Johannes&rft.au=Clark%2C+Aidan&rft.au=Hennigan%2C+Tom&rft.au=Noland%2C+Eric&rft.au=Millican%2C+Katie&rft.au=Driessche%2C+George+van+den&rft.au=Damoc%2C+Bogdan&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-IYm4Q-91"><span class="mw-cite-backlink">^ <a href="#cite_ref-IYm4Q_91-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-IYm4Q_91-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFCaballeroGuptaRishKrueger2022" class="citation arxiv cs1">Caballero, Ethan; Gupta, Kshitij; Rish, Irina; Krueger, David (2022). "Broken Neural Scaling Laws". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2210.14891">2210.14891</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Broken+Neural+Scaling+Laws&rft.date=2022&rft_id=info%3Aarxiv%2F2210.14891&rft.aulast=Caballero&rft.aufirst=Ethan&rft.au=Gupta%2C+Kshitij&rft.au=Rish%2C+Irina&rft.au=Krueger%2C+David&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-JM6s1-92"><span class="mw-cite-backlink"><b><a href="#cite_ref-JM6s1_92-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://www.jasonwei.net/blog/emergence">"137 emergent abilities of large language models"</a>. <i>Jason Wei</i><span class="reference-accessdate">. Retrieved <span class="nowrap">2023-06-24</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Jason+Wei&rft.atitle=137+emergent+abilities+of+large+language+models&rft_id=https%3A%2F%2Fwww.jasonwei.net%2Fblog%2Femergence&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-Bowman-93"><span class="mw-cite-backlink"><b><a href="#cite_ref-Bowman_93-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBowman2023" class="citation arxiv cs1">Bowman, Samuel R. (2023). "Eight Things to Know about Large Language Models". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2304.00612">2304.00612</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Eight+Things+to+Know+about+Large+Language+Models&rft.date=2023&rft_id=info%3Aarxiv%2F2304.00612&rft.aulast=Bowman&rft.aufirst=Samuel+R.&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-Heuristic-Mukherjee-94"><span class="mw-cite-backlink"><b><a href="#cite_ref-Heuristic-Mukherjee_94-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFMukherjeeChang2024" class="citation arxiv cs1">Mukherjee, Anirban; Chang, Hannah (2024). "Heuristic Reasoning in AI: Instrumental Use and Mimetic Absorption". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2403.09404">2403.09404</a></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Heuristic+Reasoning+in+AI%3A+Instrumental+Use+and+Mimetic+Absorption&rft.date=2024&rft_id=info%3Aarxiv%2F2403.09404&rft.aulast=Mukherjee&rft.aufirst=Anirban&rft.au=Chang%2C+Hannah&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-Hahn_20230314-95"><span class="mw-cite-backlink"><b><a href="#cite_ref-Hahn_20230314_95-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFHahnGoyal2023" class="citation arxiv cs1">Hahn, Michael; Goyal, Navin (2023-03-14). "A Theory of Emergent In-Context Learning as Implicit Structure Induction". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2303.07971">2303.07971</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=A+Theory+of+Emergent+In-Context+Learning+as+Implicit+Structure+Induction&rft.date=2023-03-14&rft_id=info%3Aarxiv%2F2303.07971&rft.aulast=Hahn&rft.aufirst=Michael&rft.au=Goyal%2C+Navin&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-57FEA-96"><span class="mw-cite-backlink"><b><a href="#cite_ref-57FEA_96-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPilehvarCamacho-Collados2019" class="citation journal cs1">Pilehvar, Mohammad Taher; Camacho-Collados, Jose (June 2019). <a rel="nofollow" class="external text" href="https://aclanthology.org/N19-1128">"WiC: the Word-in-Context Dataset for Evaluating Context-Sensitive Meaning Representations"</a>. <i>Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)</i>. Minneapolis, Minnesota: Association for Computational Linguistics: 1267–1273. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.18653%2Fv1%2FN19-1128">10.18653/v1/N19-1128</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:102353817">102353817</a>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20230627202732/https://aclanthology.org/N19-1128/">Archived</a> from the original on 2023-06-27<span class="reference-accessdate">. Retrieved <span class="nowrap">2023-06-27</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Proceedings+of+the+2019+Conference+of+the+North+American+Chapter+of+the+Association+for+Computational+Linguistics%3A+Human+Language+Technologies%2C+Volume+1+%28Long+and+Short+Papers%29&rft.atitle=WiC%3A+the+Word-in-Context+Dataset+for+Evaluating+Context-Sensitive+Meaning+Representations&rft.pages=1267-1273&rft.date=2019-06&rft_id=info%3Adoi%2F10.18653%2Fv1%2FN19-1128&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A102353817%23id-name%3DS2CID&rft.aulast=Pilehvar&rft.aufirst=Mohammad+Taher&rft.au=Camacho-Collados%2C+Jose&rft_id=https%3A%2F%2Faclanthology.org%2FN19-1128&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-TEIkA-97"><span class="mw-cite-backlink"><b><a href="#cite_ref-TEIkA_97-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://pilehvar.github.io/wic/">"WiC: The Word-in-Context Dataset"</a>. <i>pilehvar.github.io</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230627202725/https://pilehvar.github.io/wic/">Archived</a> from the original on 2023-06-27<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-06-27</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=pilehvar.github.io&rft.atitle=WiC%3A+The+Word-in-Context+Dataset&rft_id=https%3A%2F%2Fpilehvar.github.io%2Fwic%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-zgy1i-98"><span class="mw-cite-backlink"><b><a href="#cite_ref-zgy1i_98-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPatelPavlick2021" class="citation journal cs1">Patel, Roma; Pavlick, Ellie (2021-10-06). <a rel="nofollow" class="external text" href="https://openreview.net/forum?id=gJcEM8sxHK">"Mapping Language Models to Grounded Conceptual Spaces"</a>. <i>ICLR</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230624191940/https://openreview.net/forum?id=gJcEM8sxHK">Archived</a> from the original on 2023-06-24<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-06-27</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=ICLR&rft.atitle=Mapping+Language+Models+to+Grounded+Conceptual+Spaces&rft.date=2021-10-06&rft.aulast=Patel&rft.aufirst=Roma&rft.au=Pavlick%2C+Ellie&rft_id=https%3A%2F%2Fopenreview.net%2Fforum%3Fid%3DgJcEM8sxHK&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-Imb98-99"><span class="mw-cite-backlink"><b><a href="#cite_ref-Imb98_99-0">^</a></b></span> <span class="reference-text"><i><a rel="nofollow" class="external text" href="https://www.notion.so/A-Closer-Look-at-Large-Language-Models-Emergent-Abilities-493876b55df5479d80686f68a1abd72f">A Closer Look at Large Language Models Emergent Abilities</a> <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230624012329/https://www.notion.so/A-Closer-Look-at-Large-Language-Models-Emergent-Abilities-493876b55df5479d80686f68a1abd72f">Archived</a> 2023-06-24 at the <a href="/wiki/Wayback_Machine" title="Wayback Machine">Wayback Machine</a></i> (Yao Fu, Nov 20, 2022)</span> </li> <li id="cite_note-CeQVF-100"><span class="mw-cite-backlink"><b><a href="#cite_ref-CeQVF_100-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFOrnes2023" class="citation web cs1">Ornes, Stephen (March 16, 2023). <a rel="nofollow" class="external text" href="https://www.quantamagazine.org/the-unpredictable-abilities-emerging-from-large-ai-models-20230316/">"The Unpredictable Abilities Emerging From Large AI Models"</a>. <i>Quanta Magazine</i>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20230316203438/https://www.quantamagazine.org/the-unpredictable-abilities-emerging-from-large-ai-models-20230316/">Archived</a> from the original on March 16, 2023<span class="reference-accessdate">. Retrieved <span class="nowrap">March 16,</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Quanta+Magazine&rft.atitle=The+Unpredictable+Abilities+Emerging+From+Large+AI+Models&rft.date=2023-03-16&rft.aulast=Ornes&rft.aufirst=Stephen&rft_id=https%3A%2F%2Fwww.quantamagazine.org%2Fthe-unpredictable-abilities-emerging-from-large-ai-models-20230316%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-C775b-101"><span class="mw-cite-backlink"><b><a href="#cite_ref-C775b_101-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSchaefferMirandaKoyejo2023" class="citation arxiv cs1">Schaeffer, Rylan; Miranda, Brando; Koyejo, Sanmi (2023-04-01). "Are Emergent Abilities of Large Language Models a Mirage?". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2304.15004">2304.15004</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.AI">cs.AI</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Are+Emergent+Abilities+of+Large+Language+Models+a+Mirage%3F&rft.date=2023-04-01&rft_id=info%3Aarxiv%2F2304.15004&rft.aulast=Schaeffer&rft.aufirst=Rylan&rft.au=Miranda%2C+Brando&rft.au=Koyejo%2C+Sanmi&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-IZSIr-102"><span class="mw-cite-backlink"><b><a href="#cite_ref-IZSIr_102-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLiHopkinsBauViégas2022" class="citation arxiv cs1">Li, Kenneth; Hopkins, Aspen K.; Bau, David; Viégas, Fernanda; Pfister, Hanspeter; Wattenberg, Martin (2022-10-01). "Emergent World Representations: Exploring a Sequence Model Trained on a Synthetic Task". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2210.13382">2210.13382</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Emergent+World+Representations%3A+Exploring+a+Sequence+Model+Trained+on+a+Synthetic+Task&rft.date=2022-10-01&rft_id=info%3Aarxiv%2F2210.13382&rft.aulast=Li&rft.aufirst=Kenneth&rft.au=Hopkins%2C+Aspen+K.&rft.au=Bau%2C+David&rft.au=Vi%C3%A9gas%2C+Fernanda&rft.au=Pfister%2C+Hanspeter&rft.au=Wattenberg%2C+Martin&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-RLik9-103"><span class="mw-cite-backlink"><b><a href="#cite_ref-RLik9_103-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://thegradient.pub/othello/">"Large Language Model: world models or surface statistics?"</a>. <i>The Gradient</i>. 2023-01-21<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-06-12</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=The+Gradient&rft.atitle=Large+Language+Model%3A+world+models+or+surface+statistics%3F&rft.date=2023-01-21&rft_id=https%3A%2F%2Fthegradient.pub%2Fothello%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-Hln1l-104"><span class="mw-cite-backlink"><b><a href="#cite_ref-Hln1l_104-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFJinRinard2023" class="citation arxiv cs1">Jin, Charles; Rinard, Martin (2023-05-01). "Evidence of Meaning in Language Models Trained on Programs". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2305.11169">2305.11169</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Evidence+of+Meaning+in+Language+Models+Trained+on+Programs&rft.date=2023-05-01&rft_id=info%3Aarxiv%2F2305.11169&rft.aulast=Jin&rft.aufirst=Charles&rft.au=Rinard%2C+Martin&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-oYGlo-105"><span class="mw-cite-backlink"><b><a href="#cite_ref-oYGlo_105-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFNandaChanLieberumSmith2023" class="citation arxiv cs1">Nanda, Neel; Chan, Lawrence; Lieberum, Tom; Smith, Jess; Steinhardt, Jacob (2023-01-01). 
"Progress measures for grokking via mechanistic interpretability". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2301.05217">2301.05217</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Progress+measures+for+grokking+via+mechanistic+interpretability&rft.date=2023-01-01&rft_id=info%3Aarxiv%2F2301.05217&rft.aulast=Nanda&rft.aufirst=Neel&rft.au=Chan%2C+Lawrence&rft.au=Lieberum%2C+Tom&rft.au=Smith%2C+Jess&rft.au=Steinhardt%2C+Jacob&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-debate_understanding-106"><span class="mw-cite-backlink">^ <a href="#cite_ref-debate_understanding_106-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-debate_understanding_106-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-debate_understanding_106-2"><sup><i><b>c</b></i></sup></a> <a href="#cite_ref-debate_understanding_106-3"><sup><i><b>d</b></i></sup></a> <a href="#cite_ref-debate_understanding_106-4"><sup><i><b>e</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFMitchellKrakauer2023" class="citation journal cs1">Mitchell, Melanie; Krakauer, David C. (28 March 2023). <a rel="nofollow" class="external text" href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10068812">"The debate over understanding in AI's large language models"</a>. <i>Proceedings of the National Academy of Sciences</i>. <b>120</b> (13): e2215907120. 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2210.13966">2210.13966</a></span>. <a href="/wiki/Bibcode_(identifier)" class="mw-redirect" title="Bibcode (identifier)">Bibcode</a>:<a rel="nofollow" class="external text" href="https://ui.adsabs.harvard.edu/abs/2023PNAS..12015907M">2023PNAS..12015907M</a>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1073%2Fpnas.2215907120">10.1073/pnas.2215907120</a>. <a href="/wiki/PMC_(identifier)" class="mw-redirect" title="PMC (identifier)">PMC</a> <span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10068812">10068812</a></span>. <a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a> <a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/36943882">36943882</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Proceedings+of+the+National+Academy+of+Sciences&rft.atitle=The+debate+over+understanding+in+AI%27s+large+language+models&rft.volume=120&rft.issue=13&rft.pages=e2215907120&rft.date=2023-03-28&rft_id=https%3A%2F%2Fwww.ncbi.nlm.nih.gov%2Fpmc%2Farticles%2FPMC10068812%23id-name%3DPMC&rft_id=info%3Abibcode%2F2023PNAS..12015907M&rft_id=info%3Aarxiv%2F2210.13966&rft_id=info%3Apmid%2F36943882&rft_id=info%3Adoi%2F10.1073%2Fpnas.2215907120&rft.aulast=Mitchell&rft.aufirst=Melanie&rft.au=Krakauer%2C+David+C.&rft_id=https%3A%2F%2Fwww.ncbi.nlm.nih.gov%2Fpmc%2Farticles%2FPMC10068812&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-O8Upd-107"><span class="mw-cite-backlink"><b><a 
href="#cite_ref-O8Upd_107-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFMetz2023" class="citation news cs1">Metz, Cade (16 May 2023). <a rel="nofollow" class="external text" href="https://www.nytimes.com/2023/05/16/technology/microsoft-ai-human-reasoning.html">"Microsoft Says New A.I. Shows Signs of Human Reasoning"</a>. <i>The New York Times</i>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=The+New+York+Times&rft.atitle=Microsoft+Says+New+A.I.+Shows+Signs+of+Human+Reasoning&rft.date=2023-05-16&rft.aulast=Metz&rft.aufirst=Cade&rft_id=https%3A%2F%2Fwww.nytimes.com%2F2023%2F05%2F16%2Ftechnology%2Fmicrosoft-ai-human-reasoning.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-microsoft_sparks-108"><span class="mw-cite-backlink">^ <a href="#cite_ref-microsoft_sparks_108-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-microsoft_sparks_108-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBubeckChandrasekaranEldanGehrke2023" class="citation arxiv cs1">Bubeck, Sébastien; Chandrasekaran, Varun; Eldan, Ronen; Gehrke, Johannes; Horvitz, Eric; Kamar, Ece; Lee, Peter; Lee, Yin Tat; Li, Yuanzhi; Lundberg, Scott; Nori, Harsha; Palangi, Hamid; Ribeiro, Marco Tulio; Zhang, Yi (2023). "Sparks of Artificial General Intelligence: Early experiments with GPT-4". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2303.12712">2303.12712</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Sparks+of+Artificial+General+Intelligence%3A+Early+experiments+with+GPT-4&rft.date=2023&rft_id=info%3Aarxiv%2F2303.12712&rft.aulast=Bubeck&rft.aufirst=S%C3%A9bastien&rft.au=Chandrasekaran%2C+Varun&rft.au=Eldan%2C+Ronen&rft.au=Gehrke%2C+Johannes&rft.au=Horvitz%2C+Eric&rft.au=Kamar%2C+Ece&rft.au=Lee%2C+Peter&rft.au=Lee%2C+Yin+Tat&rft.au=Li%2C+Yuanzhi&rft.au=Lundberg%2C+Scott&rft.au=Nori%2C+Harsha&rft.au=Palangi%2C+Hamid&rft.au=Ribeiro%2C+Marco+Tulio&rft.au=Zhang%2C+Yi&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-109"><span class="mw-cite-backlink"><b><a href="#cite_ref-109">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation news cs1"><a rel="nofollow" class="external text" href="https://www.fastcompany.com/91211163/anthropic-ceo-dario-amodei-pens-a-smart-look-at-our-ai-future">"Anthropic CEO Dario Amodei pens a smart look at our AI future"</a>. <i>Fast Company</i>. 
October 17, 2024.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Fast+Company&rft.atitle=Anthropic+CEO+Dario+Amodei+pens+a+smart+look+at+our+AI+future&rft.date=2024-10-17&rft_id=https%3A%2F%2Fwww.fastcompany.com%2F91211163%2Fanthropic-ceo-dario-amodei-pens-a-smart-look-at-our-ai-future&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-rEEmH-110"><span class="mw-cite-backlink"><b><a href="#cite_ref-rEEmH_110-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation news cs1"><a rel="nofollow" class="external text" href="https://www.zdnet.com/article/chatgpt-is-more-like-an-alien-intelligence-than-a-human-brain-says-futurist/">"ChatGPT is more like an 'alien intelligence' than a human brain, says futurist"</a>. <i>ZDNET</i>. 2023. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230612065937/https://www.zdnet.com/article/chatgpt-is-more-like-an-alien-intelligence-than-a-human-brain-says-futurist/">Archived</a> from the original on 12 June 2023<span class="reference-accessdate">. 
Retrieved <span class="nowrap">12 June</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=ZDNET&rft.atitle=ChatGPT+is+more+like+an+%27alien+intelligence%27+than+a+human+brain%2C+says+futurist&rft.date=2023&rft_id=https%3A%2F%2Fwww.zdnet.com%2Farticle%2Fchatgpt-is-more-like-an-alien-intelligence-than-a-human-brain-says-futurist%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-new_yorker_kind_of_mind-111"><span class="mw-cite-backlink">^ <a href="#cite_ref-new_yorker_kind_of_mind_111-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-new_yorker_kind_of_mind_111-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFNewport2023" class="citation magazine cs1">Newport, Cal (13 April 2023). <a rel="nofollow" class="external text" href="https://www.newyorker.com/science/annals-of-artificial-intelligence/what-kind-of-mind-does-chatgpt-have">"What Kind of Mind Does ChatGPT Have?"</a>. <i>The New Yorker</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230612071443/https://www.newyorker.com/science/annals-of-artificial-intelligence/what-kind-of-mind-does-chatgpt-have">Archived</a> from the original on 12 June 2023<span class="reference-accessdate">. 
Retrieved <span class="nowrap">12 June</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=The+New+Yorker&rft.atitle=What+Kind+of+Mind+Does+ChatGPT+Have%3F&rft.date=2023-04-13&rft.aulast=Newport&rft.aufirst=Cal&rft_id=https%3A%2F%2Fwww.newyorker.com%2Fscience%2Fannals-of-artificial-intelligence%2Fwhat-kind-of-mind-does-chatgpt-have&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-rAFIZ-112"><span class="mw-cite-backlink"><b><a href="#cite_ref-rAFIZ_112-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRoose2023" class="citation news cs1">Roose, Kevin (30 May 2023). <a rel="nofollow" class="external text" href="https://www.nytimes.com/2023/05/30/technology/shoggoth-meme-ai.html">"Why an Octopus-like Creature Has Come to Symbolize the State of A.I."</a> <i>The New York Times</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230530193814/https://www.nytimes.com/2023/05/30/technology/shoggoth-meme-ai.html">Archived</a> from the original on 30 May 2023<span class="reference-accessdate">. 
Retrieved <span class="nowrap">12 June</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=The+New+York+Times&rft.atitle=Why+an+Octopus-like+Creature+Has+Come+to+Symbolize+the+State+of+A.I.&rft.date=2023-05-30&rft.aulast=Roose&rft.aufirst=Kevin&rft_id=https%3A%2F%2Fwww.nytimes.com%2F2023%2F05%2F30%2Ftechnology%2Fshoggoth-meme-ai.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-4luKE-113"><span class="mw-cite-backlink"><b><a href="#cite_ref-4luKE_113-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation news cs1"><a rel="nofollow" class="external text" href="https://time.com/6271657/a-to-z-of-artificial-intelligence/">"The A to Z of Artificial Intelligence"</a>. <i>Time Magazine</i>. 13 April 2023. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230616123839/https://time.com/6271657/a-to-z-of-artificial-intelligence/">Archived</a> from the original on 16 June 2023<span class="reference-accessdate">. 
Retrieved <span class="nowrap">12 June</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Time+Magazine&rft.atitle=The+A+to+Z+of+Artificial+Intelligence&rft.date=2023-04-13&rft_id=https%3A%2F%2Ftime.com%2F6271657%2Fa-to-z-of-artificial-intelligence%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-hallucination-survey-114"><span class="mw-cite-backlink"><b><a href="#cite_ref-hallucination-survey_114-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFJiLeeFrieskeYu2022" class="citation journal cs1">Ji, Ziwei; Lee, Nayeon; Frieske, Rita; Yu, Tiezheng; Su, Dan; Xu, Yan; Ishii, Etsuko; Bang, Yejin; Dai, Wenliang; Madotto, Andrea; Fung, Pascale (November 2022). <a rel="nofollow" class="external text" href="https://dl.acm.org/doi/pdf/10.1145/3571730">"Survey of Hallucination in Natural Language Generation"</a> <span class="cs1-format">(pdf)</span>. <i>ACM Computing Surveys</i>. <b>55</b> (12). <a href="/wiki/Association_for_Computing_Machinery" title="Association for Computing Machinery">Association for Computing Machinery</a>: 1–38. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2202.03629">2202.03629</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1145%2F3571730">10.1145/3571730</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:246652372">246652372</a>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20230326145635/https://dl.acm.org/doi/pdf/10.1145/3571730">Archived</a> from the original on 26 March 2023<span class="reference-accessdate">. Retrieved <span class="nowrap">15 January</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=ACM+Computing+Surveys&rft.atitle=Survey+of+Hallucination+in+Natural+Language+Generation&rft.volume=55&rft.issue=12&rft.pages=1-38&rft.date=2022-11&rft_id=info%3Aarxiv%2F2202.03629&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A246652372%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1145%2F3571730&rft.aulast=Ji&rft.aufirst=Ziwei&rft.au=Lee%2C+Nayeon&rft.au=Frieske%2C+Rita&rft.au=Yu%2C+Tiezheng&rft.au=Su%2C+Dan&rft.au=Xu%2C+Yan&rft.au=Ishii%2C+Etsuko&rft.au=Bang%2C+Yejin&rft.au=Dai%2C+Wenliang&rft.au=Madotto%2C+Andrea&rft.au=Fung%2C+Pascale&rft_id=https%3A%2F%2Fdl.acm.org%2Fdoi%2Fpdf%2F10.1145%2F3571730&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-115"><span class="mw-cite-backlink"><b><a href="#cite_ref-115">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFVarshneyYaoZhangChen2023" class="citation arxiv cs1">Varshney, Neeraj; Yao, Wenlin; Zhang, Hongming; Chen, Jianshu; Yu, Dong (2023). "A Stitch in Time Saves Nine: Detecting and Mitigating Hallucinations of LLMs by Validating Low-Confidence Generation". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2307.03987">2307.03987</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=A+Stitch+in+Time+Saves+Nine%3A+Detecting+and+Mitigating+Hallucinations+of+LLMs+by+Validating+Low-Confidence+Generation&rft.date=2023&rft_id=info%3Aarxiv%2F2307.03987&rft.aulast=Varshney&rft.aufirst=Neeraj&rft.au=Yao%2C+Wenlin&rft.au=Zhang%2C+Hongming&rft.au=Chen%2C+Jianshu&rft.au=Yu%2C+Dong&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-116"><span class="mw-cite-backlink"><b><a href="#cite_ref-116">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLakoff1999" class="citation book cs1">Lakoff, George (1999). <i>Philosophy in the Flesh: The Embodied Mind and Its Challenge to Western Philosophy; Appendix: The Neural Theory of Language Paradigm</i>. New York Basic Books. pp. 569–583. 
<a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-0-465-05674-3" title="Special:BookSources/978-0-465-05674-3"><bdi>978-0-465-05674-3</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Philosophy+in+the+Flesh%3A+The+Embodied+Mind+and+Its+Challenge+to+Western+Philosophy%3B+Appendix%3A+The+Neural+Theory+of+Language+Paradigm&rft.pages=569-583&rft.pub=New+York+Basic+Books&rft.date=1999&rft.isbn=978-0-465-05674-3&rft.aulast=Lakoff&rft.aufirst=George&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-117"><span class="mw-cite-backlink"><b><a href="#cite_ref-117">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFEvans2014" class="citation book cs1">Evans, Vyvyan (2014). <i>The Language Myth</i>. Cambridge University Press. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-1-107-04396-1" title="Special:BookSources/978-1-107-04396-1"><bdi>978-1-107-04396-1</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=The+Language+Myth&rft.pub=Cambridge+University+Press&rft.date=2014&rft.isbn=978-1-107-04396-1&rft.aulast=Evans&rft.aufirst=Vyvyan&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-118"><span class="mw-cite-backlink"><b><a href="#cite_ref-118">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFFriston2022" class="citation book cs1">Friston, Karl J. (2022). 
<i>Active Inference: The Free Energy Principle in Mind, Brain, and Behavior; Chapter 4 The Generative Models of Active Inference</i>. The MIT Press. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-0-262-36997-8" title="Special:BookSources/978-0-262-36997-8"><bdi>978-0-262-36997-8</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Active+Inference%3A+The+Free+Energy+Principle+in+Mind%2C+Brain%2C+and+Behavior%3B+Chapter+4+The+Generative+Models+of+Active+Inference&rft.pub=The+MIT+Press&rft.date=2022&rft.isbn=978-0-262-36997-8&rft.aulast=Friston&rft.aufirst=Karl+J.&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-Huyen-119"><span class="mw-cite-backlink">^ <a href="#cite_ref-Huyen_119-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Huyen_119-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFHuyen2019" class="citation web cs1">Huyen, Chip (October 18, 2019). <a rel="nofollow" class="external text" href="https://thegradient.pub/understanding-evaluation-metrics-for-language-models/">"Evaluation Metrics for Language Modeling"</a>. <i>The Gradient</i><span class="reference-accessdate">. 
Retrieved <span class="nowrap">January 14,</span> 2024</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=The+Gradient&rft.atitle=Evaluation+Metrics+for+Language+Modeling&rft.date=2019-10-18&rft.aulast=Huyen&rft.aufirst=Chip&rft_id=https%3A%2F%2Fthegradient.pub%2Funderstanding-evaluation-metrics-for-language-models%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-boolq-120"><span class="mw-cite-backlink">^ <a href="#cite_ref-boolq_120-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-boolq_120-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFClarkLeeChangKwiatkowski2019" class="citation arxiv cs1">Clark, Christopher; Lee, Kenton; Chang, Ming-Wei; Kwiatkowski, Tom; Collins, Michael; Toutanova, Kristina (2019). "BoolQ: Exploring the Surprising Difficulty of Natural Yes/No Questions". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1905.10044">1905.10044</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=BoolQ%3A+Exploring+the+Surprising+Difficulty+of+Natural+Yes%2FNo+Questions&rft.date=2019&rft_id=info%3Aarxiv%2F1905.10044&rft.aulast=Clark&rft.aufirst=Christopher&rft.au=Lee%2C+Kenton&rft.au=Chang%2C+Ming-Wei&rft.au=Kwiatkowski%2C+Tom&rft.au=Collins%2C+Michael&rft.au=Toutanova%2C+Kristina&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-survey-121"><span class="mw-cite-backlink">^ <a href="#cite_ref-survey_121-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-survey_121-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-survey_121-2"><sup><i><b>c</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWayne_Xin_ZhaoZhouLiTang2023" class="citation arxiv cs1">Wayne Xin Zhao; Zhou, Kun; Li, Junyi; Tang, Tianyi; Wang, Xiaolei; Hou, Yupeng; Min, Yingqian; Zhang, Beichen; Zhang, Junjie; Dong, Zican; Du, Yifan; Yang, Chen; Chen, Yushuo; Chen, Zhipeng; Jiang, Jinhao; Ren, Ruiyang; Li, Yifan; Tang, Xinyu; Liu, Zikang; Liu, Peiyu; Nie, Jian-Yun; Wen, Ji-Rong (2023). "A Survey of Large Language Models". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2303.18223">2303.18223</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=A+Survey+of+Large+Language+Models&rft.date=2023&rft_id=info%3Aarxiv%2F2303.18223&rft.au=Wayne+Xin+Zhao&rft.au=Zhou%2C+Kun&rft.au=Li%2C+Junyi&rft.au=Tang%2C+Tianyi&rft.au=Wang%2C+Xiaolei&rft.au=Hou%2C+Yupeng&rft.au=Min%2C+Yingqian&rft.au=Zhang%2C+Beichen&rft.au=Zhang%2C+Junjie&rft.au=Dong%2C+Zican&rft.au=Du%2C+Yifan&rft.au=Yang%2C+Chen&rft.au=Chen%2C+Yushuo&rft.au=Chen%2C+Zhipeng&rft.au=Jiang%2C+Jinhao&rft.au=Ren%2C+Ruiyang&rft.au=Li%2C+Yifan&rft.au=Tang%2C+Xinyu&rft.au=Liu%2C+Zikang&rft.au=Liu%2C+Peiyu&rft.au=Nie%2C+Jian-Yun&rft.au=Wen%2C+Ji-Rong&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-122"><span class="mw-cite-backlink"><b><a href="#cite_ref-122">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation cs2"><a rel="nofollow" class="external text" href="https://github.com/openai/simple-evals"><i>openai/simple-evals</i></a>, OpenAI, 2024-05-28<span class="reference-accessdate">, retrieved <span class="nowrap">2024-05-28</span></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=openai%2Fsimple-evals&rft.pub=OpenAI&rft.date=2024-05-28&rft_id=https%3A%2F%2Fgithub.com%2Fopenai%2Fsimple-evals&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-123"><span class="mw-cite-backlink"><b><a 
href="#cite_ref-123">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation cs2"><a rel="nofollow" class="external text" href="https://github.com/openai/evals"><i>openai/evals</i></a>, OpenAI, 2024-05-28, <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240508225708/https://github.com/openai/evals">archived</a> from the original on 2024-05-08<span class="reference-accessdate">, retrieved <span class="nowrap">2024-05-28</span></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=openai%2Fevals&rft.pub=OpenAI&rft.date=2024-05-28&rft_id=https%3A%2F%2Fgithub.com%2Fopenai%2Fevals&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-124"><span class="mw-cite-backlink"><b><a href="#cite_ref-124">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://imbue.com/research/70b-evals/">"Sanitized open-source datasets for natural language and code understanding: how we evaluated our 70B model"</a>. <i>imbue.com</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240726173012/https://imbue.com/research/70b-evals/">Archived</a> from the original on 2024-07-26<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2024-07-24</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=imbue.com&rft.atitle=Sanitized+open-source+datasets+for+natural+language+and+code+understanding%3A+how+we+evaluated+our+70B+model&rft_id=https%3A%2F%2Fimbue.com%2Fresearch%2F70b-evals%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-bigbench-125"><span class="mw-cite-backlink"><b><a href="#cite_ref-bigbench_125-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSrivastavaRastogiRaoAbu_Awal_Md_Shoeb2022" class="citation arxiv cs1">Srivastava, Aarohi; et al. (2022). "Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2206.04615">2206.04615</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span 
title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Beyond+the+Imitation+Game%3A+Quantifying+and+extrapolating+the+capabilities+of+language+models&rft.date=2022&rft_id=info%3Aarxiv%2F2206.04615&rft.aulast=Srivastava&rft.aufirst=Aarohi&rft.au=Rastogi%2C+Abhinav&rft.au=Rao%2C+Abhishek&rft.au=Abu+Awal+Md+Shoeb&rft.au=Abid%2C+Abubakar&rft.au=Fisch%2C+Adam&rft.au=Brown%2C+Adam+R.&rft.au=Santoro%2C+Adam&rft.au=Gupta%2C+Aditya&rft.au=Garriga-Alonso%2C+Adri%C3%A0&rft.au=Kluska%2C+Agnieszka&rft.au=Lewkowycz%2C+Aitor&rft.au=Agarwal%2C+Akshat&rft.au=Power%2C+Alethea&rft.au=Ray%2C+Alex&rft.au=Warstadt%2C+Alex&rft.au=Kocurek%2C+Alexander+W.&rft.au=Safaya%2C+Ali&rft.au=Tazarv%2C+Ali&rft.au=Xiang%2C+Alice&rft.au=Parrish%2C+Alicia&rft.au=Nie%2C+Allen&rft.au=Hussain%2C+Aman&rft.au=Askell%2C+Amanda&rft.au=Dsouza%2C+Amanda&rft.au=Slone%2C+Ambrose&rft.au=Rahane%2C+Ameet&rft.au=Iyer%2C+Anantharaman+S.&rft.au=Andreassen%2C+Anders&rft.au=Madotto%2C+Andrea&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-truthfulqa-126"><span class="mw-cite-backlink"><b><a href="#cite_ref-truthfulqa_126-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLinHiltonEvans2021" class="citation arxiv cs1">Lin, Stephanie; Hilton, Jacob; Evans, Owain (2021). "TruthfulQA: Measuring How Models Mimic Human Falsehoods". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2109.07958">2109.07958</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=TruthfulQA%3A+Measuring+How+Models+Mimic+Human+Falsehoods&rft.date=2021&rft_id=info%3Aarxiv%2F2109.07958&rft.aulast=Lin&rft.aufirst=Stephanie&rft.au=Hilton%2C+Jacob&rft.au=Evans%2C+Owain&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-hellaswag-127"><span class="mw-cite-backlink">^ <a href="#cite_ref-hellaswag_127-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-hellaswag_127-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFZellersHoltzmanBiskFarhadi2019" class="citation arxiv cs1">Zellers, Rowan; Holtzman, Ari; Bisk, Yonatan; Farhadi, Ali; Choi, Yejin (2019). "HellaSwag: Can a Machine Really Finish Your Sentence?". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1905.07830">1905.07830</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=HellaSwag%3A+Can+a+Machine+Really+Finish+Your+Sentence%3F&rft.date=2019&rft_id=info%3Aarxiv%2F1905.07830&rft.aulast=Zellers&rft.aufirst=Rowan&rft.au=Holtzman%2C+Ari&rft.au=Bisk%2C+Yonatan&rft.au=Farhadi%2C+Ali&rft.au=Choi%2C+Yejin&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-ZDTUM-128"><span class="mw-cite-backlink"><b><a href="#cite_ref-ZDTUM_128-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation journal cs1">"Prepare for truly useful large language models". <i>Nature Biomedical Engineering</i>. <b>7</b> (2): 85–86. 7 March 2023. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1038%2Fs41551-023-01012-6">10.1038/s41551-023-01012-6</a>. <a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a> <a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/36882584">36882584</a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:257403466">257403466</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Nature+Biomedical+Engineering&rft.atitle=Prepare+for+truly+useful+large+language+models&rft.volume=7&rft.issue=2&rft.pages=85-86&rft.date=2023-03-07&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A257403466%23id-name%3DS2CID&rft_id=info%3Apmid%2F36882584&rft_id=info%3Adoi%2F10.1038%2Fs41551-023-01012-6&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-81w7x-129"><span class="mw-cite-backlink"><b><a href="#cite_ref-81w7x_129-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation news cs1"><a rel="nofollow" class="external text" href="https://www.economist.com/finance-and-economics/2023/05/07/your-job-is-probably-safe-from-artificial-intelligence">"Your job is (probably) safe from artificial intelligence"</a>. <i>The Economist</i>. 7 May 2023. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230617225618/https://www.economist.com/finance-and-economics/2023/05/07/your-job-is-probably-safe-from-artificial-intelligence">Archived</a> from the original on 17 June 2023<span class="reference-accessdate">. 
Retrieved <span class="nowrap">18 June</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=The+Economist&rft.atitle=Your+job+is+%28probably%29+safe+from+artificial+intelligence&rft.date=2023-05-07&rft_id=https%3A%2F%2Fwww.economist.com%2Ffinance-and-economics%2F2023%2F05%2F07%2Fyour-job-is-probably-safe-from-artificial-intelligence&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-zIM6Y-130"><span class="mw-cite-backlink"><b><a href="#cite_ref-zIM6Y_130-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://www.goldmansachs.com/intelligence/pages/generative-ai-could-raise-global-gdp-by-7-percent.html">"Generative AI Could Raise Global GDP by 7%"</a>. <i>Goldman Sachs</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230618013836/https://www.goldmansachs.com/intelligence/pages/generative-ai-could-raise-global-gdp-by-7-percent.html">Archived</a> from the original on 18 June 2023<span class="reference-accessdate">. 
Retrieved <span class="nowrap">18 June</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Goldman+Sachs&rft.atitle=Generative+AI+Could+Raise+Global+GDP+by+7%25&rft_id=https%3A%2F%2Fwww.goldmansachs.com%2Fintelligence%2Fpages%2Fgenerative-ai-could-raise-global-gdp-by-7-percent.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-131"><span class="mw-cite-backlink"><b><a href="#cite_ref-131">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPengWangDeng2023" class="citation journal cs1">Peng, Zhencan; Wang, Zhizhi; Deng, Dong (13 June 2023). <a rel="nofollow" class="external text" href="https://people.cs.rutgers.edu/~dd903/assets/papers/sigmod23.pdf">"Near-Duplicate Sequence Search at Scale for Large Language Model Memorization Evaluation"</a> <span class="cs1-format">(PDF)</span>. <i>Proceedings of the ACM on Management of Data</i>. <b>1</b> (2): 1–18. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1145%2F3589324">10.1145/3589324</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:259213212">259213212</a>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240827053753/https://people.cs.rutgers.edu/~dd903/assets/papers/sigmod23.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 2024-08-27<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2024-01-20</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Proceedings+of+the+ACM+on+Management+of+Data&rft.atitle=Near-Duplicate+Sequence+Search+at+Scale+for+Large+Language+Model+Memorization+Evaluation&rft.volume=1&rft.issue=2&rft.pages=1-18&rft.date=2023-06-13&rft_id=info%3Adoi%2F10.1145%2F3589324&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A259213212%23id-name%3DS2CID&rft.aulast=Peng&rft.aufirst=Zhencan&rft.au=Wang%2C+Zhizhi&rft.au=Deng%2C+Dong&rft_id=https%3A%2F%2Fpeople.cs.rutgers.edu%2F~dd903%2Fassets%2Fpapers%2Fsigmod23.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span> Citing Lee et al 2022.</span> </li> <li id="cite_note-132"><span class="mw-cite-backlink"><b><a href="#cite_ref-132">^</a></b></span> <span class="reference-text"><a href="#CITEREFPengWangDeng2023">Peng, Wang & Deng 2023</a>, p. 8.</span> </li> <li id="cite_note-nD6kH-133"><span class="mw-cite-backlink"><b><a href="#cite_ref-nD6kH_133-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFAlba2023" class="citation news cs1">Alba, Davey (1 May 2023). <a rel="nofollow" class="external text" href="https://www.japantimes.co.jp/news/2023/05/01/business/tech/ai-fake-news-content-farms/">"AI chatbots have been used to create dozens of news content farms"</a>. <i>The Japan Times</i><span class="reference-accessdate">. 
Retrieved <span class="nowrap">18 June</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=The+Japan+Times&rft.atitle=AI+chatbots+have+been+used+to+create+dozens+of+news+content+farms&rft.date=2023-05-01&rft.aulast=Alba&rft.aufirst=Davey&rft_id=https%3A%2F%2Fwww.japantimes.co.jp%2Fnews%2F2023%2F05%2F01%2Fbusiness%2Ftech%2Fai-fake-news-content-farms%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-PKiPY-134"><span class="mw-cite-backlink"><b><a href="#cite_ref-PKiPY_134-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation journal cs1"><a rel="nofollow" class="external text" href="https://www.science.org/content/article/could-chatbots-help-devise-next-pandemic-virus">"Could chatbots help devise the next pandemic virus?"</a>. <i>Science</i>. 14 June 2023. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1126%2Fscience.adj2463">10.1126/science.adj2463</a>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230618013834/https://www.science.org/content/article/could-chatbots-help-devise-next-pandemic-virus">Archived</a> from the original on 18 June 2023<span class="reference-accessdate">. 
Retrieved <span class="nowrap">18 June</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Science&rft.atitle=Could+chatbots+help+devise+the+next+pandemic+virus%3F&rft.date=2023-06-14&rft_id=info%3Adoi%2F10.1126%2Fscience.adj2463&rft_id=https%3A%2F%2Fwww.science.org%2Fcontent%2Farticle%2Fcould-chatbots-help-devise-next-pandemic-virus&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-135"><span class="mw-cite-backlink"><b><a href="#cite_ref-135">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFStephen_Council2023" class="citation web cs1">Stephen Council (1 December 2023). <a rel="nofollow" class="external text" href="https://www.sfgate.com/tech/article/google-openai-chatgpt-break-model-18525445.php">"How Googlers cracked an SF rival's tech model with a single word"</a>. SFGATE. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20231216160941/https://www.sfgate.com/tech/article/google-openai-chatgpt-break-model-18525445.php">Archived</a> from the original on 16 December 2023.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=How+Googlers+cracked+an+SF+rival%27s+tech+model+with+a+single+word&rft.pub=SFGATE&rft.date=2023-12-01&rft.au=Stephen+Council&rft_id=https%3A%2F%2Fwww.sfgate.com%2Ftech%2Farticle%2Fgoogle-openai-chatgpt-break-model-18525445.php&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-136"><span class="mw-cite-backlink"><b><a href="#cite_ref-136">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFHubinger2024" class="citation arxiv cs1">Hubinger, Evan (10 January 2024). 
"Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2401.05566">2401.05566</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CR">cs.CR</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Sleeper+Agents%3A+Training+Deceptive+LLMs+that+Persist+Through+Safety+Training&rft.date=2024-01-10&rft_id=info%3Aarxiv%2F2401.05566&rft.aulast=Hubinger&rft.aufirst=Evan&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-137"><span class="mw-cite-backlink"><b><a href="#cite_ref-137">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKang2023" class="citation arxiv cs1">Kang, Daniel (2023). "Exploiting programmatic behavior of LLMs: Dual-use through standard security attacks". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2302.05733">2302.05733</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CR">cs.CR</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Exploiting+programmatic+behavior+of+LLMs%3A+Dual-use+through+standard+security+attacks&rft.date=2023&rft_id=info%3Aarxiv%2F2302.05733&rft.aulast=Kang&rft.aufirst=Daniel&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-138"><span class="mw-cite-backlink"><b><a href="#cite_ref-138">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWang2024" class="citation web cs1">Wang, Yongge (20 June 2024). <a rel="nofollow" class="external text" href="https://eprint.iacr.org/2024/586.pdf">"Encryption Based Covert Channel for Large Language Models"</a> <span class="cs1-format">(PDF)</span>. IACR ePrint 2024/586. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240624191233/https://eprint.iacr.org/2024/586.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 24 June 2024<span class="reference-accessdate">. 
Retrieved <span class="nowrap">24 June</span> 2024</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Encryption+Based+Covert+Channel+for+Large+Language+Models&rft.pub=IACR+ePrint+2024%2F586&rft.date=2024-06-20&rft.aulast=Wang&rft.aufirst=Yongge&rft_id=https%3A%2F%2Feprint.iacr.org%2F2024%2F586.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-:8-139"><span class="mw-cite-backlink">^ <a href="#cite_ref-:8_139-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:8_139-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFStokel-Walker2023" class="citation web cs1">Stokel-Walker, Chris (November 22, 2023). <a rel="nofollow" class="external text" href="https://www.scientificamerican.com/article/chatgpt-replicates-gender-bias-in-recommendation-letters/">"ChatGPT Replicates Gender Bias in Recommendation Letters"</a>. <i>Scientific American</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20231229043124/https://www.scientificamerican.com/article/chatgpt-replicates-gender-bias-in-recommendation-letters/">Archived</a> from the original on 2023-12-29<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-12-29</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Scientific+American&rft.atitle=ChatGPT+Replicates+Gender+Bias+in+Recommendation+Letters&rft.date=2023-11-22&rft.aulast=Stokel-Walker&rft.aufirst=Chris&rft_id=https%3A%2F%2Fwww.scientificamerican.com%2Farticle%2Fchatgpt-replicates-gender-bias-in-recommendation-letters%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-:1-140"><span class="mw-cite-backlink"><b><a href="#cite_ref-:1_140-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLuoPuettSmith2023" class="citation arxiv cs1">Luo, Queenie; Puett, Michael J.; Smith, Michael D. (2023-03-28). "A Perspectival Mirror of the Elephant: Investigating Language Bias on Google, ChatGPT, Wikipedia, and YouTube". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2303.16281v2">2303.16281v2</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CY">cs.CY</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=A+Perspectival+Mirror+of+the+Elephant%3A+Investigating+Language+Bias+on+Google%2C+ChatGPT%2C+Wikipedia%2C+and+YouTube&rft.date=2023-03-28&rft_id=info%3Aarxiv%2F2303.16281v2&rft.aulast=Luo&rft.aufirst=Queenie&rft.au=Puett%2C+Michael+J.&rft.au=Smith%2C+Michael+D.&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-141"><span class="mw-cite-backlink"><b><a href="#cite_ref-141">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFChengDurmusJurafsky2023" class="citation cs2">Cheng, Myra; Durmus, Esin; Jurafsky, Dan (2023-05-29), <i>Marked Personas: Using Natural Language Prompts to Measure Stereotypes in Language Models</i>, <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2305.18189">2305.18189</a></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Marked+Personas%3A+Using+Natural+Language+Prompts+to+Measure+Stereotypes+in+Language+Models&rft.date=2023-05-29&rft_id=info%3Aarxiv%2F2305.18189&rft.aulast=Cheng&rft.aufirst=Myra&rft.au=Durmus%2C+Esin&rft.au=Jurafsky%2C+Dan&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-142"><span 
class="mw-cite-backlink"><b><a href="#cite_ref-142">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKotekDockumSun2023" class="citation book cs1">Kotek, Hadas; Dockum, Rikker; Sun, David (2023-11-05). <a rel="nofollow" class="external text" href="https://dl.acm.org/doi/10.1145/3582269.3615599">"Gender bias and stereotypes in Large Language Models"</a>. <i>Proceedings of the ACM Collective Intelligence Conference</i>. CI '23. New York, NY, USA: Association for Computing Machinery. pp. 12–24. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1145%2F3582269.3615599">10.1145/3582269.3615599</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/979-8-4007-0113-9" title="Special:BookSources/979-8-4007-0113-9"><bdi>979-8-4007-0113-9</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Gender+bias+and+stereotypes+in+Large+Language+Models&rft.btitle=Proceedings+of+the+ACM+Collective+Intelligence+Conference&rft.place=New+York%2C+NY%2C+USA&rft.series=CI+%2723&rft.pages=12-24&rft.pub=Association+for+Computing+Machinery&rft.date=2023-11-05&rft_id=info%3Adoi%2F10.1145%2F3582269.3615599&rft.isbn=979-8-4007-0113-9&rft.aulast=Kotek&rft.aufirst=Hadas&rft.au=Dockum%2C+Rikker&rft.au=Sun%2C+David&rft_id=https%3A%2F%2Fdl.acm.org%2Fdoi%2F10.1145%2F3582269.3615599&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-143"><span class="mw-cite-backlink"><b><a href="#cite_ref-143">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFHeikkilä2023" class="citation web cs1">Heikkilä, Melissa (August 
7, 2023). <a rel="nofollow" class="external text" href="https://www.technologyreview.com/2023/08/07/1077324/ai-language-models-are-rife-with-political-biases/">"AI language models are rife with different political biases"</a>. <i>MIT Technology Review</i><span class="reference-accessdate">. Retrieved <span class="nowrap">2023-12-29</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=MIT+Technology+Review&rft.atitle=AI+language+models+are+rife+with+different+political+biases&rft.date=2023-08-07&rft.aulast=Heikkil%C3%A4&rft.aufirst=Melissa&rft_id=https%3A%2F%2Fwww.technologyreview.com%2F2023%2F08%2F07%2F1077324%2Fai-language-models-are-rife-with-political-biases%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-oai-unsup-147"><span class="mw-cite-backlink"><b><a href="#cite_ref-oai-unsup_147-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://openai.com/research/language-unsupervised">"Improving language understanding with unsupervised learning"</a>. <i>openai.com</i>. June 11, 2018. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230318210736/https://openai.com/research/language-unsupervised">Archived</a> from the original on 2023-03-18<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-03-18</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=openai.com&rft.atitle=Improving+language+understanding+with+unsupervised+learning&rft.date=2018-06-11&rft_id=https%3A%2F%2Fopenai.com%2Fresearch%2Flanguage-unsupervised&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-gpt1-148"><span class="mw-cite-backlink"><b><a href="#cite_ref-gpt1_148-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://github.com/openai/finetune-transformer-lm">"finetune-transformer-lm"</a>. <i>GitHub</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230519062127/https://github.com/openai/finetune-transformer-lm">Archived</a> from the original on 19 May 2023<span class="reference-accessdate">. Retrieved <span class="nowrap">2 January</span> 2024</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=GitHub&rft.atitle=finetune-transformer-lm&rft_id=https%3A%2F%2Fgithub.com%2Fopenai%2Ffinetune-transformer-lm&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-bert-paper-149"><span class="mw-cite-backlink">^ <a href="#cite_ref-bert-paper_149-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-bert-paper_149-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDevlinChangLeeToutanova2018" class="citation arxiv cs1">Devlin, Jacob; Chang, Ming-Wei; Lee, Kenton; Toutanova, Kristina (11 October 2018). 
"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1810.04805v2">1810.04805v2</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=BERT%3A+Pre-training+of+Deep+Bidirectional+Transformers+for+Language+Understanding&rft.date=2018-10-11&rft_id=info%3Aarxiv%2F1810.04805v2&rft.aulast=Devlin&rft.aufirst=Jacob&rft.au=Chang%2C+Ming-Wei&rft.au=Lee%2C+Kenton&rft.au=Toutanova%2C+Kristina&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-bHZJ2-150"><span class="mw-cite-backlink"><b><a href="#cite_ref-bHZJ2_150-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPrickett2021" class="citation web cs1">Prickett, Nicole Hemsoth (2021-08-24). <a rel="nofollow" class="external text" href="https://www.nextplatform.com/2021/08/24/cerebras-shifts-architecture-to-meet-massive-ai-ml-models/">"Cerebras Shifts Architecture To Meet Massive AI/ML Models"</a>. <i>The Next Platform</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230620151619/https://www.nextplatform.com/2021/08/24/cerebras-shifts-architecture-to-meet-massive-ai-ml-models/">Archived</a> from the original on 2023-06-20<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-06-20</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=The+Next+Platform&rft.atitle=Cerebras+Shifts+Architecture+To+Meet+Massive+AI%2FML+Models&rft.date=2021-08-24&rft.aulast=Prickett&rft.aufirst=Nicole+Hemsoth&rft_id=https%3A%2F%2Fwww.nextplatform.com%2F2021%2F08%2F24%2Fcerebras-shifts-architecture-to-meet-massive-ai-ml-models%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-bert-web-151"><span class="mw-cite-backlink"><b><a href="#cite_ref-bert-web_151-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://github.com/google-research/bert">"BERT"</a>. March 13, 2023. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20210113211317/https://github.com/google-research/bert">Archived</a> from the original on January 13, 2021<span class="reference-accessdate">. Retrieved <span class="nowrap">March 13,</span> 2023</span> – via GitHub.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=BERT&rft.date=2023-03-13&rft_id=https%3A%2F%2Fgithub.com%2Fgoogle-research%2Fbert&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-Ir545-152"><span class="mw-cite-backlink"><b><a href="#cite_ref-Ir545_152-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPatelLiRasooliConstant2022" class="citation arxiv cs1">Patel, Ajay; Li, Bryan; Rasooli, Mohammad Sadegh; Constant, Noah; Raffel, Colin; Callison-Burch, Chris (2022). "Bidirectional Language Models Are Also Few-shot Learners". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2209.14500">2209.14500</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Bidirectional+Language+Models+Are+Also+Few-shot+Learners&rft.date=2022&rft_id=info%3Aarxiv%2F2209.14500&rft.aulast=Patel&rft.aufirst=Ajay&rft.au=Li%2C+Bryan&rft.au=Rasooli%2C+Mohammad+Sadegh&rft.au=Constant%2C+Noah&rft.au=Raffel%2C+Colin&rft.au=Callison-Burch%2C+Chris&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-:02-153"><span class="mw-cite-backlink"><b><a href="#cite_ref-:02_153-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation arxiv cs1">Devlin, Jacob; Chang, Ming-Wei; Lee, Kenton; Toutanova, Kristina (11 October 2018). "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1810.04805v2">1810.04805v2</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=BERT%3A+Pre-training+of+Deep+Bidirectional+Transformers+for+Language+Understanding&rft.date=2018-10-11&rft_id=info%3Aarxiv%2F1810.04805v2&rft.aulast=Devlin&rft.aufirst=Jacob&rft.au=Chang%2C+Ming-Wei&rft.au=Lee%2C+Kenton&rft.au=Toutanova%2C+Kristina&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-:6-154"><span class="mw-cite-backlink">^ <a href="#cite_ref-:6_154-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:6_154-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRaffelShazeerRobertsLee2020" class="citation journal cs1">Raffel, Colin; Shazeer, Noam; Roberts, Adam; Lee, Katherine; Narang, Sharan; Matena, Michael; Zhou, Yanqi; Li, Wei; Liu, Peter J. (2020). <a rel="nofollow" class="external text" href="http://jmlr.org/papers/v21/20-074.html">"Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer"</a>. <i>Journal of Machine Learning Research</i>. <b>21</b> (140): 1–67. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1910.10683">1910.10683</a></span>. 
<a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/1533-7928">1533-7928</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Journal+of+Machine+Learning+Research&rft.atitle=Exploring+the+Limits+of+Transfer+Learning+with+a+Unified+Text-to-Text+Transformer&rft.volume=21&rft.issue=140&rft.pages=1-67&rft.date=2020&rft_id=info%3Aarxiv%2F1910.10683&rft.issn=1533-7928&rft.aulast=Raffel&rft.aufirst=Colin&rft.au=Shazeer%2C+Noam&rft.au=Roberts%2C+Adam&rft.au=Lee%2C+Katherine&rft.au=Narang%2C+Sharan&rft.au=Matena%2C+Michael&rft.au=Zhou%2C+Yanqi&rft.au=Li%2C+Wei&rft.au=Liu%2C+Peter+J.&rft_id=http%3A%2F%2Fjmlr.org%2Fpapers%2Fv21%2F20-074.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-155"><span class="mw-cite-backlink"><b><a href="#cite_ref-155">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation cs2"><a rel="nofollow" class="external text" href="https://github.com/google-research/text-to-text-transfer-transformer"><i>google-research/text-to-text-transfer-transformer</i></a>, Google Research, 2024-04-02, <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240329112957/https://github.com/google-research/text-to-text-transfer-transformer">archived</a> from the original on 2024-03-29<span class="reference-accessdate">, retrieved <span class="nowrap">2024-04-04</span></span></cite><span 
title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=google-research%2Ftext-to-text-transfer-transformer&rft.pub=Google+Research&rft.date=2024-04-02&rft_id=https%3A%2F%2Fgithub.com%2Fgoogle-research%2Ftext-to-text-transfer-transformer&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-156"><span class="mw-cite-backlink"><b><a href="#cite_ref-156">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://imagen.research.google/">"Imagen: Text-to-Image Diffusion Models"</a>. <i>imagen.research.google</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240327201713/https://imagen.research.google/">Archived</a> from the original on 2024-03-27<span class="reference-accessdate">. Retrieved <span class="nowrap">2024-04-04</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=imagen.research.google&rft.atitle=Imagen%3A+Text-to-Image+Diffusion+Models&rft_id=https%3A%2F%2Fimagen.research.google%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-157"><span class="mw-cite-backlink"><b><a href="#cite_ref-157">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://huggingface.co/transformers/v2.0.0/pretrained_models.html">"Pretrained models — transformers 2.0.0 documentation"</a>. <i>huggingface.co</i>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20240805032110/https://huggingface.co/transformers/v2.0.0/pretrained_models.html">Archived</a> from the original on 2024-08-05<span class="reference-accessdate">. Retrieved <span class="nowrap">2024-08-05</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=huggingface.co&rft.atitle=Pretrained+models+%E2%80%94+transformers+2.0.0+documentation&rft_id=https%3A%2F%2Fhuggingface.co%2Ftransformers%2Fv2.0.0%2Fpretrained_models.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-xlnet-158"><span class="mw-cite-backlink"><b><a href="#cite_ref-xlnet_158-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://github.com/zihangdai/xlnet/">"xlnet"</a>. <i>GitHub</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240102191842/https://github.com/zihangdai/xlnet/">Archived</a> from the original on 2 January 2024<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2 January</span> 2024</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=GitHub&rft.atitle=xlnet&rft_id=https%3A%2F%2Fgithub.com%2Fzihangdai%2Fxlnet%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-LX3rI-159"><span class="mw-cite-backlink"><b><a href="#cite_ref-LX3rI_159-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFYangDaiYangCarbonell2020" class="citation arxiv cs1">Yang, Zhilin; Dai, Zihang; Yang, Yiming; Carbonell, Jaime; Salakhutdinov, Ruslan; Le, Quoc V. (2 January 2020). "XLNet: Generalized Autoregressive Pretraining for Language Understanding". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1906.08237">1906.08237</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=XLNet%3A+Generalized+Autoregressive+Pretraining+for+Language+Understanding&rft.date=2020-01-02&rft_id=info%3Aarxiv%2F1906.08237&rft.aulast=Yang&rft.aufirst=Zhilin&rft.au=Dai%2C+Zihang&rft.au=Yang%2C+Yiming&rft.au=Carbonell%2C+Jaime&rft.au=Salakhutdinov%2C+Ruslan&rft.au=Le%2C+Quoc+V.&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-15Brelease-160"><span class="mw-cite-backlink"><b><a href="#cite_ref-15Brelease_160-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" 
href="https://openai.com/blog/gpt-2-1-5b-release/">"GPT-2: 1.5B Release"</a>. <i>OpenAI</i>. 2019-11-05. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20191114074358/https://openai.com/blog/gpt-2-1-5b-release/">Archived</a> from the original on 2019-11-14<span class="reference-accessdate">. Retrieved <span class="nowrap">2019-11-14</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=OpenAI&rft.atitle=GPT-2%3A+1.5B+Release&rft.date=2019-11-05&rft_id=https%3A%2F%2Fopenai.com%2Fblog%2Fgpt-2-1-5b-release%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-5T8u5-161"><span class="mw-cite-backlink"><b><a href="#cite_ref-5T8u5_161-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://openai.com/research/better-language-models">"Better language models and their implications"</a>. <i>openai.com</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230316160730/https://openai.com/research/better-language-models">Archived</a> from the original on 2023-03-16<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-03-13</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=openai.com&rft.atitle=Better+language+models+and+their+implications&rft_id=https%3A%2F%2Fopenai.com%2Fresearch%2Fbetter-language-models&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-LambdaLabs-162"><span class="mw-cite-backlink">^ <a href="#cite_ref-LambdaLabs_162-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-LambdaLabs_162-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://lambdalabs.com/blog/demystifying-gpt-3">"OpenAI's GPT-3 Language Model: A Technical Overview"</a>. <i>lambdalabs.com</i>. 3 June 2020. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230327213811/https://lambdalabs.com/blog/demystifying-gpt-3">Archived</a> from the original on 27 March 2023<span class="reference-accessdate">. 
Retrieved <span class="nowrap">13 March</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=lambdalabs.com&rft.atitle=OpenAI%27s+GPT-3+Language+Model%3A+A+Technical+Overview&rft.date=2020-06-03&rft_id=https%3A%2F%2Flambdalabs.com%2Fblog%2Fdemystifying-gpt-3&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-:10-163"><span class="mw-cite-backlink">^ <a href="#cite_ref-:10_163-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:10_163-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://huggingface.co/openai-community/gpt2-xl">"openai-community/gpt2-xl · Hugging Face"</a>. <i>huggingface.co</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240724041702/https://huggingface.co/openai-community/gpt2-xl">Archived</a> from the original on 2024-07-24<span class="reference-accessdate">. Retrieved <span class="nowrap">2024-07-24</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=huggingface.co&rft.atitle=openai-community%2Fgpt2-xl+%C2%B7+Hugging+Face&rft_id=https%3A%2F%2Fhuggingface.co%2Fopenai-community%2Fgpt2-xl&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-Sudbe-164"><span class="mw-cite-backlink"><b><a href="#cite_ref-Sudbe_164-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://github.com/openai/gpt-2">"gpt-2"</a>. <i>GitHub</i>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20230311154936/https://github.com/openai/gpt-2">Archived</a> from the original on 11 March 2023<span class="reference-accessdate">. Retrieved <span class="nowrap">13 March</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=GitHub&rft.atitle=gpt-2&rft_id=https%3A%2F%2Fgithub.com%2Fopenai%2Fgpt-2&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-:2-165"><span class="mw-cite-backlink"><b><a href="#cite_ref-:2_165-0">^</a></b></span> <span class="reference-text">Table D.1 in <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBrownMannRyderSubbiah2020" class="citation arxiv cs1">Brown, Tom B.; Mann, Benjamin; Ryder, Nick; Subbiah, Melanie; Kaplan, Jared; Dhariwal, Prafulla; Neelakantan, Arvind; Shyam, Pranav; Sastry, Girish; Askell, Amanda; Agarwal, Sandhini; Herbert-Voss, Ariel; Krueger, Gretchen; Henighan, Tom; Child, Rewon; Ramesh, Aditya; Ziegler, Daniel M.; Wu, Jeffrey; Winter, Clemens; Hesse, Christopher; Chen, Mark; Sigler, Eric; Litwin, Mateusz; Gray, Scott; Chess, Benjamin; Clark, Jack; Berner, Christopher; McCandlish, Sam; Radford, Alec; Sutskever, Ilya; Amodei, Dario (May 28, 2020). "Language Models are Few-Shot Learners". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2005.14165v4">2005.14165v4</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Language+Models+are+Few-Shot+Learners&rft.date=2020-05-28&rft_id=info%3Aarxiv%2F2005.14165v4&rft.aulast=Brown&rft.aufirst=Tom+B.&rft.au=Mann%2C+Benjamin&rft.au=Ryder%2C+Nick&rft.au=Subbiah%2C+Melanie&rft.au=Kaplan%2C+Jared&rft.au=Dhariwal%2C+Prafulla&rft.au=Neelakantan%2C+Arvind&rft.au=Shyam%2C+Pranav&rft.au=Sastry%2C+Girish&rft.au=Askell%2C+Amanda&rft.au=Agarwal%2C+Sandhini&rft.au=Herbert-Voss%2C+Ariel&rft.au=Krueger%2C+Gretchen&rft.au=Henighan%2C+Tom&rft.au=Child%2C+Rewon&rft.au=Ramesh%2C+Aditya&rft.au=Ziegler%2C+Daniel+M.&rft.au=Wu%2C+Jeffrey&rft.au=Winter%2C+Clemens&rft.au=Hesse%2C+Christopher&rft.au=Chen%2C+Mark&rft.au=Sigler%2C+Eric&rft.au=Litwin%2C+Mateusz&rft.au=Gray%2C+Scott&rft.au=Chess%2C+Benjamin&rft.au=Clark%2C+Jack&rft.au=Berner%2C+Christopher&rft.au=McCandlish%2C+Sam&rft.au=Radford%2C+Alec&rft.au=Sutskever%2C+Ilya&rft.au=Amodei%2C+Dario&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-chatgpt-blog-166"><span class="mw-cite-backlink"><b><a href="#cite_ref-chatgpt-blog_166-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://openai.com/blog/chatgpt/">"ChatGPT: Optimizing Language Models for Dialogue"</a>. <i>OpenAI</i>. 2022-11-30. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20221130180912/https://openai.com/blog/chatgpt/">Archived</a> from the original on 2022-11-30<span class="reference-accessdate">. Retrieved <span class="nowrap">2023-01-13</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=OpenAI&rft.atitle=ChatGPT%3A+Optimizing+Language+Models+for+Dialogue&rft.date=2022-11-30&rft_id=https%3A%2F%2Fopenai.com%2Fblog%2Fchatgpt%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-gpt-neo-167"><span class="mw-cite-backlink"><b><a href="#cite_ref-gpt-neo_167-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://github.com/EleutherAI/gpt-neo">"GPT Neo"</a>. March 15, 2023. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230312225202/https://github.com/EleutherAI/gpt-neo">Archived</a> from the original on March 12, 2023<span class="reference-accessdate">. 
Retrieved <span class="nowrap">March 12,</span> 2023</span> – via GitHub.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=GPT+Neo&rft.date=2023-03-15&rft_id=https%3A%2F%2Fgithub.com%2FEleutherAI%2Fgpt-neo&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-Pile-168"><span class="mw-cite-backlink">^ <a href="#cite_ref-Pile_168-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Pile_168-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-Pile_168-2"><sup><i><b>c</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFGaoBidermanBlackGolding2020" class="citation arxiv cs1">Gao, Leo; Biderman, Stella; Black, Sid; Golding, Laurence; Hoppe, Travis; Foster, Charles; Phang, Jason; He, Horace; Thite, Anish; Nabeshima, Noa; Presser, Shawn; Leahy, Connor (31 December 2020). "The Pile: An 800GB Dataset of Diverse Text for Language Modeling". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2101.00027">2101.00027</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=The+Pile%3A+An+800GB+Dataset+of+Diverse+Text+for+Language+Modeling&rft.date=2020-12-31&rft_id=info%3Aarxiv%2F2101.00027&rft.aulast=Gao&rft.aufirst=Leo&rft.au=Biderman%2C+Stella&rft.au=Black%2C+Sid&rft.au=Golding%2C+Laurence&rft.au=Hoppe%2C+Travis&rft.au=Foster%2C+Charles&rft.au=Phang%2C+Jason&rft.au=He%2C+Horace&rft.au=Thite%2C+Anish&rft.au=Nabeshima%2C+Noa&rft.au=Presser%2C+Shawn&rft.au=Leahy%2C+Connor&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-vb-gpt-neo-169"><span class="mw-cite-backlink">^ <a href="#cite_ref-vb-gpt-neo_169-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-vb-gpt-neo_169-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFIyer2021" class="citation web cs1">Iyer, Abhishek (15 May 2021). <a rel="nofollow" class="external text" href="https://venturebeat.com/ai/gpt-3s-free-alternative-gpt-neo-is-something-to-be-excited-about/">"GPT-3's free alternative GPT-Neo is something to be excited about"</a>. <i>VentureBeat</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230309012717/https://venturebeat.com/ai/gpt-3s-free-alternative-gpt-neo-is-something-to-be-excited-about/">Archived</a> from the original on 9 March 2023<span class="reference-accessdate">. 
Retrieved <span class="nowrap">13 March</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=VentureBeat&rft.atitle=GPT-3%27s+free+alternative+GPT-Neo+is+something+to+be+excited+about&rft.date=2021-05-15&rft.aulast=Iyer&rft.aufirst=Abhishek&rft_id=https%3A%2F%2Fventurebeat.com%2Fai%2Fgpt-3s-free-alternative-gpt-neo-is-something-to-be-excited-about%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-JxohJ-170"><span class="mw-cite-backlink"><b><a href="#cite_ref-JxohJ_170-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://web.archive.org/web/20230309205439/https://www.forefront.ai/blog-posts/gpt-j-6b-an-introduction-to-the-largest-open-sourced-gpt-model">"GPT-J-6B: An Introduction to the Largest Open Source GPT Model | Forefront"</a>. <i>www.forefront.ai</i>. Archived from <a rel="nofollow" class="external text" href="https://www.forefront.ai/blog-posts/gpt-j-6b-an-introduction-to-the-largest-open-sourced-gpt-model">the original</a> on 2023-03-09<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-02-28</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=www.forefront.ai&rft.atitle=GPT-J-6B%3A+An+Introduction+to+the+Largest+Open+Source+GPT+Model+%7C+Forefront&rft_id=https%3A%2F%2Fwww.forefront.ai%2Fblog-posts%2Fgpt-j-6b-an-introduction-to-the-largest-open-sourced-gpt-model&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-:3-171"><span class="mw-cite-backlink">^ <a href="#cite_ref-:3_171-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:3_171-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-:3_171-2"><sup><i><b>c</b></i></sup></a> <a href="#cite_ref-:3_171-3"><sup><i><b>d</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDeyGosalZhimingChen2023" class="citation arxiv cs1">Dey, Nolan; Gosal, Gurpreet; Chen, Zhiming; Khachane, Hemant; Marshall, William; Pathria, Ribhu; Tom, Marvin; Hestness, Joel (2023-04-01). "Cerebras-GPT: Open Compute-Optimal Language Models Trained on the Cerebras Wafer-Scale Cluster". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2304.03208">2304.03208</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Cerebras-GPT%3A+Open+Compute-Optimal+Language+Models+Trained+on+the+Cerebras+Wafer-Scale+Cluster&rft.date=2023-04-01&rft_id=info%3Aarxiv%2F2304.03208&rft.aulast=Dey&rft.aufirst=Nolan&rft.au=Gosal%2C+Gurpreet&rft.au=Chen%2C+Zhiming&rft.au=Khachane%2C+Hemant&rft.au=Marshall%2C+William&rft.au=Pathria%2C+Ribhu&rft.au=Tom%2C+Marvin&rft.au=Hestness%2C+Joel&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-BwnW5-172"><span class="mw-cite-backlink"><b><a href="#cite_ref-BwnW5_172-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFAlviKharya2021" class="citation web cs1">Alvi, Ali; Kharya, Paresh (11 October 2021). <a rel="nofollow" class="external text" href="https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/">"Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, the World's Largest and Most Powerful Generative Language Model"</a>. <i>Microsoft Research</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230313180531/https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/">Archived</a> from the original on 13 March 2023<span class="reference-accessdate">. 
Retrieved <span class="nowrap">13 March</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Microsoft+Research&rft.atitle=Using+DeepSpeed+and+Megatron+to+Train+Megatron-Turing+NLG+530B%2C+the+World%27s+Largest+and+Most+Powerful+Generative+Language+Model&rft.date=2021-10-11&rft.aulast=Alvi&rft.aufirst=Ali&rft.au=Kharya%2C+Paresh&rft_id=https%3A%2F%2Fwww.microsoft.com%2Fen-us%2Fresearch%2Fblog%2Fusing-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-mtnlg-preprint-173"><span class="mw-cite-backlink">^ <a href="#cite_ref-mtnlg-preprint_173-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-mtnlg-preprint_173-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSmithPatwaryNorickLeGresley2022" class="citation arxiv cs1">Smith, Shaden; Patwary, Mostofa; Norick, Brandon; LeGresley, Patrick; Rajbhandari, Samyam; Casper, Jared; Liu, Zhun; Prabhumoye, Shrimai; Zerveas, George; Korthikanti, Vijay; Zhang, Elton; Child, Rewon; Aminabadi, Reza Yazdani; Bernauer, Julie; Song, Xia (2022-02-04). "Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2201.11990">2201.11990</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Using+DeepSpeed+and+Megatron+to+Train+Megatron-Turing+NLG+530B%2C+A+Large-Scale+Generative+Language+Model&rft.date=2022-02-04&rft_id=info%3Aarxiv%2F2201.11990&rft.aulast=Smith&rft.aufirst=Shaden&rft.au=Patwary%2C+Mostofa&rft.au=Norick%2C+Brandon&rft.au=LeGresley%2C+Patrick&rft.au=Rajbhandari%2C+Samyam&rft.au=Casper%2C+Jared&rft.au=Liu%2C+Zhun&rft.au=Prabhumoye%2C+Shrimai&rft.au=Zerveas%2C+George&rft.au=Korthikanti%2C+Vijay&rft.au=Zhang%2C+Elton&rft.au=Child%2C+Rewon&rft.au=Aminabadi%2C+Reza+Yazdani&rft.au=Bernauer%2C+Julie&rft.au=Song%2C+Xia&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-:11-174"><span class="mw-cite-backlink">^ <a href="#cite_ref-:11_174-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:11_174-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRajbhandariLiYaoZhang2022" class="citation cs2">Rajbhandari, Samyam; Li, Conglong; Yao, Zhewei; Zhang, Minjia; Aminabadi, Reza Yazdani; Awan, Ammar Ahmad; Rasley, Jeff; He, Yuxiong (2022-07-21), <i>DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale</i>, <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" 
href="https://arxiv.org/abs/2201.05596">2201.05596</a></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=DeepSpeed-MoE%3A+Advancing+Mixture-of-Experts+Inference+and+Training+to+Power+Next-Generation+AI+Scale&rft.date=2022-07-21&rft_id=info%3Aarxiv%2F2201.05596&rft.aulast=Rajbhandari&rft.aufirst=Samyam&rft.au=Li%2C+Conglong&rft.au=Yao%2C+Zhewei&rft.au=Zhang%2C+Minjia&rft.au=Aminabadi%2C+Reza+Yazdani&rft.au=Awan%2C+Ammar+Ahmad&rft.au=Rasley%2C+Jeff&rft.au=He%2C+Yuxiong&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-qeOB8-175"><span class="mw-cite-backlink"><b><a href="#cite_ref-qeOB8_175-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWangSunXiangWu2021" class="citation arxiv cs1">Wang, Shuohuan; Sun, Yu; Xiang, Yang; Wu, Zhihua; Ding, Siyu; Gong, Weibao; Feng, Shikun; Shang, Junyuan; Zhao, Yanbin; Pang, Chao; Liu, Jiaxiang; Chen, Xuyi; Lu, Yuxiang; Liu, Weixin; Wang, Xi; Bai, Yangfan; Chen, Qiuliang; Zhao, Li; Li, Shiyong; Sun, Peng; Yu, Dianhai; Ma, Yanjun; Tian, Hao; Wu, Hua; Wu, Tian; Zeng, Wei; Li, Ge; Gao, Wen; Wang, Haifeng (December 23, 2021). "ERNIE 3.0 Titan: Exploring Larger-scale Knowledge Enhanced Pre-training for Language Understanding and Generation". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2112.12731">2112.12731</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=ERNIE+3.0+Titan%3A+Exploring+Larger-scale+Knowledge+Enhanced+Pre-training+for+Language+Understanding+and+Generation&rft.date=2021-12-23&rft_id=info%3Aarxiv%2F2112.12731&rft.aulast=Wang&rft.aufirst=Shuohuan&rft.au=Sun%2C+Yu&rft.au=Xiang%2C+Yang&rft.au=Wu%2C+Zhihua&rft.au=Ding%2C+Siyu&rft.au=Gong%2C+Weibao&rft.au=Feng%2C+Shikun&rft.au=Shang%2C+Junyuan&rft.au=Zhao%2C+Yanbin&rft.au=Pang%2C+Chao&rft.au=Liu%2C+Jiaxiang&rft.au=Chen%2C+Xuyi&rft.au=Lu%2C+Yuxiang&rft.au=Liu%2C+Weixin&rft.au=Wang%2C+Xi&rft.au=Bai%2C+Yangfan&rft.au=Chen%2C+Qiuliang&rft.au=Zhao%2C+Li&rft.au=Li%2C+Shiyong&rft.au=Sun%2C+Peng&rft.au=Yu%2C+Dianhai&rft.au=Ma%2C+Yanjun&rft.au=Tian%2C+Hao&rft.au=Wu%2C+Hua&rft.au=Wu%2C+Tian&rft.au=Zeng%2C+Wei&rft.au=Li%2C+Ge&rft.au=Gao%2C+Wen&rft.au=Wang%2C+Haifeng&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-i8jc4-176"><span class="mw-cite-backlink"><b><a href="#cite_ref-i8jc4_176-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://www.anthropic.com/product">"Product"</a>. <i>Anthropic</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230316145444/https://www.anthropic.com/product">Archived</a> from the original on 16 March 2023<span class="reference-accessdate">. 
Retrieved <span class="nowrap">14 March</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Anthropic&rft.atitle=Product&rft_id=https%3A%2F%2Fwww.anthropic.com%2Fproduct&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-AnthroArch-177"><span class="mw-cite-backlink">^ <a href="#cite_ref-AnthroArch_177-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-AnthroArch_177-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFAskellBaiChenDrain2021" class="citation arxiv cs1">Askell, Amanda; Bai, Yuntao; Chen, Anna; et al. (9 December 2021). "A General Language Assistant as a Laboratory for Alignment". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2112.00861">2112.00861</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span 
title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=A+General+Language+Assistant+as+a+Laboratory+for+Alignment&rft.date=2021-12-09&rft_id=info%3Aarxiv%2F2112.00861&rft.aulast=Askell&rft.aufirst=Amanda&rft.au=Bai%2C+Yuntao&rft.au=Chen%2C+Anna&rft.au=Drain%2C+Dawn&rft.au=Ganguli%2C+Deep&rft.au=Henighan%2C+Tom&rft.au=Jones%2C+Andy&rft.au=Joseph%2C+Nicholas&rft.au=Mann%2C+Ben&rft.au=DasSarma%2C+Nova&rft.au=Elhage%2C+Nelson&rft.au=Hatfield-Dodds%2C+Zac&rft.au=Hernandez%2C+Danny&rft.au=Kernion%2C+Jackson&rft.au=Ndousse%2C+Kamal&rft.au=Olsson%2C+Catherine&rft.au=Amodei%2C+Dario&rft.au=Brown%2C+Tom&rft.au=Clark%2C+Jack&rft.au=McCandlish%2C+Sam&rft.au=Olah%2C+Chris&rft.au=Kaplan%2C+Jared&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-RZqhw-178"><span class="mw-cite-backlink"><b><a href="#cite_ref-RZqhw_178-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBaiKadavathKunduAskell2022" class="citation arxiv cs1">Bai, Yuntao; Kadavath, Saurav; Kundu, Sandipan; et al. (15 December 2022). "Constitutional AI: Harmlessness from AI Feedback". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2212.08073">2212.08073</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Constitutional+AI%3A+Harmlessness+from+AI+Feedback&rft.date=2022-12-15&rft_id=info%3Aarxiv%2F2212.08073&rft.aulast=Bai&rft.aufirst=Yuntao&rft.au=Kadavath%2C+Saurav&rft.au=Kundu%2C+Sandipan&rft.au=Askell%2C+Amanda&rft.au=Kernion%2C+Jackson&rft.au=Jones%2C+Andy&rft.au=Chen%2C+Anna&rft.au=Goldie%2C+Anna&rft.au=Mirhoseini%2C+Azalia&rft.au=McKinnon%2C+Cameron&rft.au=Chen%2C+Carol&rft.au=Olsson%2C+Catherine&rft.au=Olah%2C+Christopher&rft.au=Hernandez%2C+Danny&rft.au=Drain%2C+Dawn&rft.au=Ganguli%2C+Deep&rft.au=Li%2C+Dustin&rft.au=Tran-Johnson%2C+Eli&rft.au=Perez%2C+Ethan&rft.au=Kerr%2C+Jamie&rft.au=Mueller%2C+Jared&rft.au=Ladish%2C+Jeffrey&rft.au=Landau%2C+Joshua&rft.au=Ndousse%2C+Kamal&rft.au=Lukosuite%2C+Kamile&rft.au=Lovitt%2C+Liane&rft.au=Sellitto%2C+Michael&rft.au=Elhage%2C+Nelson&rft.au=Schiefer%2C+Nicholas&rft.au=Mercado%2C+Noemi&rft.au=DasSarma%2C+Nova&rft.au=Lasenby%2C+Robert&rft.au=Larson%2C+Robin&rft.au=Ringer%2C+Sam&rft.au=Johnston%2C+Scott&rft.au=Kravec%2C+Shauna&rft.au=Showk%2C+Sheer+El&rft.au=Fort%2C+Stanislav&rft.au=Lanham%2C+Tamera&rft.au=Telleen-Lawton%2C+Timothy&rft.au=Conerly%2C+Tom&rft.au=Henighan%2C+Tom&rft.au=Hume%2C+Tristan&rft.au=Bowman%2C+Samuel+R.&rft.au=Hatfield-Dodds%2C+Zac&rft.au=Mann%2C+Ben&rft.au=Amodei%2C+Dario&rft.au=Joseph%2C+Nicholas&rft.au=McCandlish%2C+Sam&rft.au=Brown%2C+Tom&rft.au=Kaplan%2C+Jared&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-mD5eE-179"><span class="mw-cite-backlink"><b><a 
href="#cite_ref-mD5eE_179-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://www.deepmind.com/blog/language-modelling-at-scale-gopher-ethical-considerations-and-retrieval">"Language modelling at scale: Gopher, ethical considerations, and retrieval"</a>. <i>www.deepmind.com</i>. 8 December 2021. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230320082323/https://www.deepmind.com/blog/language-modelling-at-scale-gopher-ethical-considerations-and-retrieval">Archived</a> from the original on 20 March 2023<span class="reference-accessdate">. Retrieved <span class="nowrap">20 March</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=www.deepmind.com&rft.atitle=Language+modelling+at+scale%3A+Gopher%2C+ethical+considerations%2C+and+retrieval&rft.date=2021-12-08&rft_id=https%3A%2F%2Fwww.deepmind.com%2Fblog%2Flanguage-modelling-at-scale-gopher-ethical-considerations-and-retrieval&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-hoffman-180"><span class="mw-cite-backlink">^ <a href="#cite_ref-hoffman_180-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-hoffman_180-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-hoffman_180-2"><sup><i><b>c</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFHoffmannBorgeaudMenschBuchatskaya2022" class="citation arxiv cs1">Hoffmann, Jordan; Borgeaud, Sebastian; Mensch, Arthur; et al. (29 March 2022). "Training Compute-Optimal Large Language Models". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2203.15556">2203.15556</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Training+Compute-Optimal+Large+Language+Models&rft.date=2022-03-29&rft_id=info%3Aarxiv%2F2203.15556&rft.aulast=Hoffmann&rft.aufirst=Jordan&rft.au=Borgeaud%2C+Sebastian&rft.au=Mensch%2C+Arthur&rft.au=Buchatskaya%2C+Elena&rft.au=Cai%2C+Trevor&rft.au=Rutherford%2C+Eliza&rft.au=Casas%2C+Diego+de+Las&rft.au=Hendricks%2C+Lisa+Anne&rft.au=Welbl%2C+Johannes&rft.au=Clark%2C+Aidan&rft.au=Hennigan%2C+Tom&rft.au=Noland%2C+Eric&rft.au=Millican%2C+Katie&rft.au=Driessche%2C+George+van+den&rft.au=Damoc%2C+Bogdan&rft.au=Guy%2C+Aurelia&rft.au=Osindero%2C+Simon&rft.au=Simonyan%2C+Karen&rft.au=Elsen%2C+Erich&rft.au=Rae%2C+Jack+W.&rft.au=Vinyals%2C+Oriol&rft.au=Sifre%2C+Laurent&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-:4-181"><span class="mw-cite-backlink">^ <a href="#cite_ref-:4_181-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:4_181-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-:4_181-2"><sup><i><b>c</b></i></sup></a> <a href="#cite_ref-:4_181-3"><sup><i><b>d</b></i></sup></a></span> <span class="reference-text">Table 20 and page 66 of <i><a rel="nofollow" class="external text" href="https://storage.googleapis.com/pathways-language-model/PaLM-paper.pdf">PaLM: Scaling Language Modeling with Pathways</a> <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230610040050/https://storage.googleapis.com/pathways-language-model/PaLM-paper.pdf">Archived</a> 2023-06-10 at the <a href="/wiki/Wayback_Machine" 
title="Wayback Machine">Wayback Machine</a></i></span> </li> <li id="cite_note-lamda-blog-182"><span class="mw-cite-backlink">^ <a href="#cite_ref-lamda-blog_182-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-lamda-blog_182-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFChengThoppilan2022" class="citation web cs1">Cheng, Heng-Tze; Thoppilan, Romal (January 21, 2022). <a rel="nofollow" class="external text" href="https://ai.googleblog.com/2022/01/lamda-towards-safe-grounded-and-high.html">"LaMDA: Towards Safe, Grounded, and High-Quality Dialog Models for Everything"</a>. <i>ai.googleblog.com</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20220325014118/https://ai.googleblog.com/2022/01/lamda-towards-safe-grounded-and-high.html">Archived</a> from the original on 2022-03-25<span class="reference-accessdate">. Retrieved <span class="nowrap">2023-03-09</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=ai.googleblog.com&rft.atitle=LaMDA%3A+Towards+Safe%2C+Grounded%2C+and+High-Quality+Dialog+Models+for+Everything&rft.date=2022-01-21&rft.aulast=Cheng&rft.aufirst=Heng-Tze&rft.au=Thoppilan%2C+Romal&rft_id=https%3A%2F%2Fai.googleblog.com%2F2022%2F01%2Flamda-towards-safe-grounded-and-high.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-DMs9Z-183"><span class="mw-cite-backlink"><b><a href="#cite_ref-DMs9Z_183-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFThoppilanDe_FreitasHallShazeer2022" class="citation arxiv cs1">Thoppilan, Romal; De Freitas, Daniel; Hall, Jamie; Shazeer, Noam; Kulshreshtha, Apoorv; Cheng, Heng-Tze; Jin, Alicia; Bos, Taylor; Baker, Leslie; Du, Yu; 
Li, YaGuang; Lee, Hongrae; Zheng, Huaixiu Steven; Ghafouri, Amin; Menegali, Marcelo (2022-01-01). "LaMDA: Language Models for Dialog Applications". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2201.08239">2201.08239</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=LaMDA%3A+Language+Models+for+Dialog+Applications&rft.date=2022-01-01&rft_id=info%3Aarxiv%2F2201.08239&rft.aulast=Thoppilan&rft.aufirst=Romal&rft.au=De+Freitas%2C+Daniel&rft.au=Hall%2C+Jamie&rft.au=Shazeer%2C+Noam&rft.au=Kulshreshtha%2C+Apoorv&rft.au=Cheng%2C+Heng-Tze&rft.au=Jin%2C+Alicia&rft.au=Bos%2C+Taylor&rft.au=Baker%2C+Leslie&rft.au=Du%2C+Yu&rft.au=Li%2C+YaGuang&rft.au=Lee%2C+Hongrae&rft.au=Zheng%2C+Huaixiu+Steven&rft.au=Ghafouri%2C+Amin&rft.au=Menegali%2C+Marcelo&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-gpt-neox-20b-184"><span class="mw-cite-backlink"><b><a href="#cite_ref-gpt-neox-20b_184-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBlackBidermanHallahan2022" class="citation conference cs1">Black, Sidney; Biderman, Stella; Hallahan, Eric; et al. (2022-05-01). <a rel="nofollow" class="external text" href="https://aclanthology.org/2022.bigscience-1.9/"><i>GPT-NeoX-20B: An Open-Source Autoregressive Language Model</i></a>. Proceedings of BigScience Episode #5 – Workshop on Challenges & Perspectives in Creating Large Language Models. pp. 
95–136. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20221210082456/https://aclanthology.org/2022.bigscience-1.9/">Archived</a> from the original on 2022-12-10<span class="reference-accessdate">. Retrieved <span class="nowrap">2022-12-19</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.btitle=GPT-NeoX-20B%3A+An+Open-Source+Autoregressive+Language+Model&rft.pages=95-136&rft.date=2022-05-01&rft.aulast=Black&rft.aufirst=Sidney&rft.au=Biderman%2C+Stella&rft.au=Hallahan%2C+Eric&rft_id=https%3A%2F%2Faclanthology.org%2F2022.bigscience-1.9%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-chinchilla-blog-185"><span class="mw-cite-backlink">^ <a href="#cite_ref-chinchilla-blog_185-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-chinchilla-blog_185-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-chinchilla-blog_185-2"><sup><i><b>c</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFHoffmannBorgeaudMenschSifre2022" class="citation web cs1">Hoffmann, Jordan; Borgeaud, Sebastian; Mensch, Arthur; Sifre, Laurent (12 April 2022). <a rel="nofollow" class="external text" href="https://www.deepmind.com/blog/an-empirical-analysis-of-compute-optimal-large-language-model-training">"An empirical analysis of compute-optimal large language model training"</a>. <i>Deepmind Blog</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20220413014510/https://www.deepmind.com/blog/an-empirical-analysis-of-compute-optimal-large-language-model-training">Archived</a> from the original on 13 April 2022<span class="reference-accessdate">. 
Retrieved <span class="nowrap">9 March</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Deepmind+Blog&rft.atitle=An+empirical+analysis+of+compute-optimal+large+language+model+training&rft.date=2022-04-12&rft.aulast=Hoffmann&rft.aufirst=Jordan&rft.au=Borgeaud%2C+Sebastian&rft.au=Mensch%2C+Arthur&rft.au=Sifre%2C+Laurent&rft_id=https%3A%2F%2Fwww.deepmind.com%2Fblog%2Fan-empirical-analysis-of-compute-optimal-large-language-model-training&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-palm-blog-186"><span class="mw-cite-backlink"><b><a href="#cite_ref-palm-blog_186-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFNarangChowdhery2022" class="citation web cs1">Narang, Sharan; Chowdhery, Aakanksha (April 4, 2022). <a rel="nofollow" class="external text" href="https://ai.googleblog.com/2022/04/pathways-language-model-palm-scaling-to.html">"Pathways Language Model (PaLM): Scaling to 540 Billion Parameters for Breakthrough Performance"</a>. <i>ai.googleblog.com</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20220404161447/https://ai.googleblog.com/2022/04/pathways-language-model-palm-scaling-to.html">Archived</a> from the original on 2022-04-04<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-03-09</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=ai.googleblog.com&rft.atitle=Pathways+Language+Model+%28PaLM%29%3A+Scaling+to+540+Billion+Parameters+for+Breakthrough+Performance&rft.date=2022-04-04&rft.aulast=Narang&rft.aufirst=Sharan&rft.au=Chowdhery%2C+Aakanksha&rft_id=https%3A%2F%2Fai.googleblog.com%2F2022%2F04%2Fpathways-language-model-palm-scaling-to.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-jlof8-187"><span class="mw-cite-backlink"><b><a href="#cite_ref-jlof8_187-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSusan_ZhangMona_DiabLuke_Zettlemoyer" class="citation web cs1">Susan Zhang; Mona Diab; Luke Zettlemoyer. <a rel="nofollow" class="external text" href="https://ai.facebook.com/blog/democratizing-access-to-large-scale-language-models-with-opt-175b/">"Democratizing access to large-scale language models with OPT-175B"</a>. <i>ai.facebook.com</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230312231820/https://ai.facebook.com/blog/democratizing-access-to-large-scale-language-models-with-opt-175b/">Archived</a> from the original on 2023-03-12<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-03-12</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=ai.facebook.com&rft.atitle=Democratizing+access+to+large-scale+language+models+with+OPT-175B&rft.au=Susan+Zhang&rft.au=Mona+Diab&rft.au=Luke+Zettlemoyer&rft_id=https%3A%2F%2Fai.facebook.com%2Fblog%2Fdemocratizing-access-to-large-scale-language-models-with-opt-175b%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-QjTIc-188"><span class="mw-cite-backlink"><b><a href="#cite_ref-QjTIc_188-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFZhangRollerGoyalArtetxe2022" class="citation arxiv cs1">Zhang, Susan; Roller, Stephen; Goyal, Naman; Artetxe, Mikel; Chen, Moya; Chen, Shuohui; Dewan, Christopher; Diab, Mona; Li, Xian; Lin, Xi Victoria; Mihaylov, Todor; Ott, Myle; Shleifer, Sam; Shuster, Kurt; Simig, Daniel; Koura, Punit Singh; Sridhar, Anjali; Wang, Tianlu; Zettlemoyer, Luke (21 June 2022). "OPT: Open Pre-trained Transformer Language Models". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2205.01068">2205.01068</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=OPT%3A+Open+Pre-trained+Transformer+Language+Models&rft.date=2022-06-21&rft_id=info%3Aarxiv%2F2205.01068&rft.aulast=Zhang&rft.aufirst=Susan&rft.au=Roller%2C+Stephen&rft.au=Goyal%2C+Naman&rft.au=Artetxe%2C+Mikel&rft.au=Chen%2C+Moya&rft.au=Chen%2C+Shuohui&rft.au=Dewan%2C+Christopher&rft.au=Diab%2C+Mona&rft.au=Li%2C+Xian&rft.au=Lin%2C+Xi+Victoria&rft.au=Mihaylov%2C+Todor&rft.au=Ott%2C+Myle&rft.au=Shleifer%2C+Sam&rft.au=Shuster%2C+Kurt&rft.au=Simig%2C+Daniel&rft.au=Koura%2C+Punit+Singh&rft.au=Sridhar%2C+Anjali&rft.au=Wang%2C+Tianlu&rft.au=Zettlemoyer%2C+Luke&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-190"><span class="mw-cite-backlink"><b><a href="#cite_ref-190">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://github.com/facebookresearch/metaseq/tree/main/projects/OPT/chronicles">"metaseq/projects/OPT/chronicles at main · facebookresearch/metaseq"</a>. <i>GitHub</i><span class="reference-accessdate">. 
Retrieved <span class="nowrap">2024-10-18</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=GitHub&rft.atitle=metaseq%2Fprojects%2FOPT%2Fchronicles+at+main+%C2%B7+facebookresearch%2Fmetaseq&rft_id=https%3A%2F%2Fgithub.com%2Ffacebookresearch%2Fmetaseq%2Ftree%2Fmain%2Fprojects%2FOPT%2Fchronicles&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-yalm-repo-191"><span class="mw-cite-backlink">^ <a href="#cite_ref-yalm-repo_191-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-yalm-repo_191-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKhrushchevVasilevPetrovZinov2022" class="citation cs2">Khrushchev, Mikhail; Vasilev, Ruslan; Petrov, Alexey; Zinov, Nikolay (2022-06-22), <a rel="nofollow" class="external text" href="https://github.com/yandex/YaLM-100B"><i>YaLM 100B</i></a>, <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230616050056/https://github.com/yandex/YaLM-100B">archived</a> from the original on 2023-06-16<span class="reference-accessdate">, retrieved <span class="nowrap">2023-03-18</span></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=YaLM+100B&rft.date=2022-06-22&rft.aulast=Khrushchev&rft.aufirst=Mikhail&rft.au=Vasilev%2C+Ruslan&rft.au=Petrov%2C+Alexey&rft.au=Zinov%2C+Nikolay&rft_id=https%3A%2F%2Fgithub.com%2Fyandex%2FYaLM-100B&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-minerva-paper-192"><span class="mw-cite-backlink">^ <a href="#cite_ref-minerva-paper_192-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-minerva-paper_192-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link 
rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLewkowyczAndreassenDohanDyer2022" class="citation arxiv cs1">Lewkowycz, Aitor; Andreassen, Anders; Dohan, David; Dyer, Ethan; Michalewski, Henryk; Ramasesh, Vinay; Slone, Ambrose; Anil, Cem; Schlag, Imanol; Gutman-Solo, Theo; Wu, Yuhuai; Neyshabur, Behnam; Gur-Ari, Guy; Misra, Vedant (30 June 2022). "Solving Quantitative Reasoning Problems with Language Models". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2206.14858">2206.14858</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Solving+Quantitative+Reasoning+Problems+with+Language+Models&rft.date=2022-06-30&rft_id=info%3Aarxiv%2F2206.14858&rft.aulast=Lewkowycz&rft.aufirst=Aitor&rft.au=Andreassen%2C+Anders&rft.au=Dohan%2C+David&rft.au=Dyer%2C+Ethan&rft.au=Michalewski%2C+Henryk&rft.au=Ramasesh%2C+Vinay&rft.au=Slone%2C+Ambrose&rft.au=Anil%2C+Cem&rft.au=Schlag%2C+Imanol&rft.au=Gutman-Solo%2C+Theo&rft.au=Wu%2C+Yuhuai&rft.au=Neyshabur%2C+Behnam&rft.au=Gur-Ari%2C+Guy&rft.au=Misra%2C+Vedant&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-FfCNK-193"><span class="mw-cite-backlink"><b><a href="#cite_ref-FfCNK_193-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://ai.googleblog.com/2022/06/minerva-solving-quantitative-reasoning.html">"Minerva: Solving Quantitative Reasoning Problems with Language Models"</a>. <i>ai.googleblog.com</i>. 
30 June 2022<span class="reference-accessdate">. Retrieved <span class="nowrap">20 March</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=ai.googleblog.com&rft.atitle=Minerva%3A+Solving+Quantitative+Reasoning+Problems+with+Language+Models&rft.date=2022-06-30&rft_id=https%3A%2F%2Fai.googleblog.com%2F2022%2F06%2Fminerva-solving-quantitative-reasoning.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-bigger-better-194"><span class="mw-cite-backlink"><b><a href="#cite_ref-bigger-better_194-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFAnanthaswamy2023" class="citation journal cs1">Ananthaswamy, Anil (8 March 2023). <a rel="nofollow" class="external text" href="https://www.nature.com/articles/d41586-023-00641-w">"In AI, is bigger always better?"</a>. <i>Nature</i>. <b>615</b> (7951): 202–205. <a href="/wiki/Bibcode_(identifier)" class="mw-redirect" title="Bibcode (identifier)">Bibcode</a>:<a rel="nofollow" class="external text" href="https://ui.adsabs.harvard.edu/abs/2023Natur.615..202A">2023Natur.615..202A</a>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1038%2Fd41586-023-00641-w">10.1038/d41586-023-00641-w</a>. <a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a> <a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/36890378">36890378</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:257380916">257380916</a>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20230316181013/https://www.nature.com/articles/d41586-023-00641-w">Archived</a> from the original on 16 March 2023<span class="reference-accessdate">. Retrieved <span class="nowrap">9 March</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Nature&rft.atitle=In+AI%2C+is+bigger+always+better%3F&rft.volume=615&rft.issue=7951&rft.pages=202-205&rft.date=2023-03-08&rft_id=info%3Adoi%2F10.1038%2Fd41586-023-00641-w&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A257380916%23id-name%3DS2CID&rft_id=info%3Apmid%2F36890378&rft_id=info%3Abibcode%2F2023Natur.615..202A&rft.aulast=Ananthaswamy&rft.aufirst=Anil&rft_id=https%3A%2F%2Fwww.nature.com%2Farticles%2Fd41586-023-00641-w&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-B8wB2-195"><span class="mw-cite-backlink"><b><a href="#cite_ref-B8wB2_195-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://huggingface.co/bigscience/bloom">"bigscience/bloom · Hugging Face"</a>. <i>huggingface.co</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230412002547/https://huggingface.co/bigscience/bloom">Archived</a> from the original on 2023-04-12<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-03-13</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=huggingface.co&rft.atitle=bigscience%2Fbloom+%C2%B7+Hugging+Face&rft_id=https%3A%2F%2Fhuggingface.co%2Fbigscience%2Fbloom&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-37sY6-196"><span class="mw-cite-backlink"><b><a href="#cite_ref-37sY6_196-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFTaylorKardasCucurullScialom2022" class="citation arxiv cs1">Taylor, Ross; Kardas, Marcin; Cucurull, Guillem; Scialom, Thomas; Hartshorn, Anthony; Saravia, Elvis; Poulton, Andrew; Kerkez, Viktor; Stojnic, Robert (16 November 2022). "Galactica: A Large Language Model for Science". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2211.09085">2211.09085</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Galactica%3A+A+Large+Language+Model+for+Science&rft.date=2022-11-16&rft_id=info%3Aarxiv%2F2211.09085&rft.aulast=Taylor&rft.aufirst=Ross&rft.au=Kardas%2C+Marcin&rft.au=Cucurull%2C+Guillem&rft.au=Scialom%2C+Thomas&rft.au=Hartshorn%2C+Anthony&rft.au=Saravia%2C+Elvis&rft.au=Poulton%2C+Andrew&rft.au=Kerkez%2C+Viktor&rft.au=Stojnic%2C+Robert&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-u5szh-197"><span class="mw-cite-backlink"><b><a href="#cite_ref-u5szh_197-0">^</a></b></span> <span class="reference-text"><link 
rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://www.amazon.science/blog/20b-parameter-alexa-model-sets-new-marks-in-few-shot-learning">"20B-parameter Alexa model sets new marks in few-shot learning"</a>. <i>Amazon Science</i>. 2 August 2022. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230315190223/https://www.amazon.science/blog/20b-parameter-alexa-model-sets-new-marks-in-few-shot-learning">Archived</a> from the original on 15 March 2023<span class="reference-accessdate">. Retrieved <span class="nowrap">12 March</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Amazon+Science&rft.atitle=20B-parameter+Alexa+model+sets+new+marks+in+few-shot+learning&rft.date=2022-08-02&rft_id=https%3A%2F%2Fwww.amazon.science%2Fblog%2F20b-parameter-alexa-model-sets-new-marks-in-few-shot-learning&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-HaA7l-198"><span class="mw-cite-backlink"><b><a href="#cite_ref-HaA7l_198-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSoltanAnanthakrishnanFitzGeraldGupta2022" class="citation arxiv cs1">Soltan, Saleh; Ananthakrishnan, Shankar; FitzGerald, Jack; et al. (3 August 2022). "AlexaTM 20B: Few-Shot Learning Using a Large-Scale Multilingual Seq2Seq Model". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2208.01448">2208.01448</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=AlexaTM+20B%3A+Few-Shot+Learning+Using+a+Large-Scale+Multilingual+Seq2Seq+Model&rft.date=2022-08-03&rft_id=info%3Aarxiv%2F2208.01448&rft.aulast=Soltan&rft.aufirst=Saleh&rft.au=Ananthakrishnan%2C+Shankar&rft.au=FitzGerald%2C+Jack&rft.au=Gupta%2C+Rahul&rft.au=Hamza%2C+Wael&rft.au=Khan%2C+Haidar&rft.au=Peris%2C+Charith&rft.au=Rawls%2C+Stephen&rft.au=Rosenbaum%2C+Andy&rft.au=Rumshisky%2C+Anna&rft.au=Prakash%2C+Chandana+Satya&rft.au=Sridhar%2C+Mukund&rft.au=Triefenbach%2C+Fabian&rft.au=Verma%2C+Apurv&rft.au=Tur%2C+Gokhan&rft.au=Natarajan%2C+Prem&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-rpehM-199"><span class="mw-cite-backlink"><b><a href="#cite_ref-rpehM_199-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://aws.amazon.com/blogs/machine-learning/alexatm-20b-is-now-available-in-amazon-sagemaker-jumpstart/">"AlexaTM 20B is now available in Amazon SageMaker JumpStart | AWS Machine Learning Blog"</a>. <i>aws.amazon.com</i>. 17 November 2022. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230313163933/https://aws.amazon.com/blogs/machine-learning/alexatm-20b-is-now-available-in-amazon-sagemaker-jumpstart/">Archived</a> from the original on 13 March 2023<span class="reference-accessdate">. 
Retrieved <span class="nowrap">13 March</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=aws.amazon.com&rft.atitle=AlexaTM+20B+is+now+available+in+Amazon+SageMaker+JumpStart+%7C+AWS+Machine+Learning+Blog&rft.date=2022-11-17&rft_id=https%3A%2F%2Faws.amazon.com%2Fblogs%2Fmachine-learning%2Falexatm-20b-is-now-available-in-amazon-sagemaker-jumpstart%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-llama-blog-200"><span class="mw-cite-backlink">^ <a href="#cite_ref-llama-blog_200-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-llama-blog_200-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-llama-blog_200-2"><sup><i><b>c</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://ai.facebook.com/blog/large-language-model-llama-meta-ai/">"Introducing LLaMA: A foundational, 65-billion-parameter large language model"</a>. <i>Meta AI</i>. 24 February 2023. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230303112302/https://ai.facebook.com/blog/large-language-model-llama-meta-ai/">Archived</a> from the original on 3 March 2023<span class="reference-accessdate">. 
Retrieved <span class="nowrap">9 March</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Meta+AI&rft.atitle=Introducing+LLaMA%3A+A+foundational%2C+65-billion-parameter+large+language+model&rft.date=2023-02-24&rft_id=https%3A%2F%2Fai.facebook.com%2Fblog%2Flarge-language-model-llama-meta-ai%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-:5-201"><span class="mw-cite-backlink">^ <a href="#cite_ref-:5_201-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:5_201-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-:5_201-2"><sup><i><b>c</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://huggingface.co/blog/falcon">"The Falcon has landed in the Hugging Face ecosystem"</a>. <i>huggingface.co</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230620002832/https://huggingface.co/blog/falcon">Archived</a> from the original on 2023-06-20<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-06-20</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=huggingface.co&rft.atitle=The+Falcon+has+landed+in+the+Hugging+Face+ecosystem&rft_id=https%3A%2F%2Fhuggingface.co%2Fblog%2Ffalcon&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-GPT4Tech-203"><span class="mw-cite-backlink"><b><a href="#cite_ref-GPT4Tech_203-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://cdn.openai.com/papers/gpt-4.pdf">"GPT-4 Technical Report"</a> <span class="cs1-format">(PDF)</span>. <i><a href="/wiki/OpenAI" title="OpenAI">OpenAI</a></i>. 2023. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230314190904/https://cdn.openai.com/papers/gpt-4.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on March 14, 2023<span class="reference-accessdate">. Retrieved <span class="nowrap">March 14,</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=OpenAI&rft.atitle=GPT-4+Technical+Report&rft.date=2023&rft_id=https%3A%2F%2Fcdn.openai.com%2Fpapers%2Fgpt-4.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-205"><span class="mw-cite-backlink"><b><a href="#cite_ref-205">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSchreiner2023" class="citation web cs1">Schreiner, Maximilian (2023-07-11). 
<a rel="nofollow" class="external text" href="https://the-decoder.com/gpt-4-architecture-datasets-costs-and-more-leaked/">"GPT-4 architecture, datasets, costs and more leaked"</a>. <i>THE DECODER</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230712123915/https://the-decoder.com/gpt-4-architecture-datasets-costs-and-more-leaked/">Archived</a> from the original on 2023-07-12<span class="reference-accessdate">. Retrieved <span class="nowrap">2024-07-26</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=THE+DECODER&rft.atitle=GPT-4+architecture%2C+datasets%2C+costs+and+more+leaked&rft.date=2023-07-11&rft.aulast=Schreiner&rft.aufirst=Maximilian&rft_id=https%3A%2F%2Fthe-decoder.com%2Fgpt-4-architecture-datasets-costs-and-more-leaked%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-206"><span class="mw-cite-backlink"><b><a href="#cite_ref-206">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDickson2024" class="citation news cs1">Dickson, Ben (22 May 2024). <a rel="nofollow" class="external text" href="https://venturebeat.com/ai/meta-introduces-chameleon-a-state-of-the-art-multimodal-model/">"Meta introduces Chameleon, a state-of-the-art multimodal model"</a>. 
<i>VentureBeat</i>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=VentureBeat&rft.atitle=Meta+introduces+Chameleon%2C+a+state-of-the-art+multimodal+model&rft.date=2024-05-22&rft.aulast=Dickson&rft.aufirst=Ben&rft_id=https%3A%2F%2Fventurebeat.com%2Fai%2Fmeta-introduces-chameleon-a-state-of-the-art-multimodal-model%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-D0k2a-207"><span class="mw-cite-backlink"><b><a href="#cite_ref-D0k2a_207-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDey2023" class="citation web cs1">Dey, Nolan (March 28, 2023). <a rel="nofollow" class="external text" href="https://www.cerebras.net/blog/cerebras-gpt-a-family-of-open-compute-efficient-large-language-models/">"Cerebras-GPT: A Family of Open, Compute-efficient, Large Language Models"</a>. <i>Cerebras</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230328213339/https://www.cerebras.net/blog/cerebras-gpt-a-family-of-open-compute-efficient-large-language-models/">Archived</a> from the original on March 28, 2023<span class="reference-accessdate">. 
Retrieved <span class="nowrap">March 28,</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Cerebras&rft.atitle=Cerebras-GPT%3A+A+Family+of+Open%2C+Compute-efficient%2C+Large+Language+Models&rft.date=2023-03-28&rft.aulast=Dey&rft.aufirst=Nolan&rft_id=https%3A%2F%2Fwww.cerebras.net%2Fblog%2Fcerebras-gpt-a-family-of-open-compute-efficient-large-language-models%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-falcon-208"><span class="mw-cite-backlink"><b><a href="#cite_ref-falcon_208-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://fastcompanyme.com/news/abu-dhabi-based-tii-launches-its-own-version-of-chatgpt/">"Abu Dhabi-based TII launches its own version of ChatGPT"</a>. <i>fastcompanyme.com</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230403021729/https://fastcompanyme.com/news/abu-dhabi-based-tii-launches-its-own-version-of-chatgpt/">Archived</a> from the original on 2023-04-03<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-04-03</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=fastcompanyme.com&rft.atitle=Abu+Dhabi-based+TII+launches+its+own+version+of+ChatGPT&rft_id=https%3A%2F%2Ffastcompanyme.com%2Fnews%2Fabu-dhabi-based-tii-launches-its-own-version-of-chatgpt%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-Xb1gq-209"><span class="mw-cite-backlink"><b><a href="#cite_ref-Xb1gq_209-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPenedoMalarticHesslowCojocaru2023" class="citation arxiv cs1">Penedo, Guilherme; Malartic, Quentin; Hesslow, Daniel; Cojocaru, Ruxandra; Cappelli, Alessandro; Alobeidli, Hamza; Pannier, Baptiste; Almazrouei, Ebtesam; Launay, Julien (2023-06-01). "The RefinedWeb Dataset for Falcon LLM: Outperforming Curated Corpora with Web Data, and Web Data Only". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2306.01116">2306.01116</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=The+RefinedWeb+Dataset+for+Falcon+LLM%3A+Outperforming+Curated+Corpora+with+Web+Data%2C+and+Web+Data+Only&rft.date=2023-06-01&rft_id=info%3Aarxiv%2F2306.01116&rft.aulast=Penedo&rft.aufirst=Guilherme&rft.au=Malartic%2C+Quentin&rft.au=Hesslow%2C+Daniel&rft.au=Cojocaru%2C+Ruxandra&rft.au=Cappelli%2C+Alessandro&rft.au=Alobeidli%2C+Hamza&rft.au=Pannier%2C+Baptiste&rft.au=Almazrouei%2C+Ebtesam&rft.au=Launay%2C+Julien&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-gzTNw-210"><span class="mw-cite-backlink"><b><a href="#cite_ref-gzTNw_210-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://huggingface.co/tiiuae/falcon-40b">"tiiuae/falcon-40b · Hugging Face"</a>. <i>huggingface.co</i>. 2023-06-09<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-06-20</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=huggingface.co&rft.atitle=tiiuae%2Ffalcon-40b+%C2%B7+Hugging+Face&rft.date=2023-06-09&rft_id=https%3A%2F%2Fhuggingface.co%2Ftiiuae%2Ffalcon-40b&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-Wmlcs-211"><span class="mw-cite-backlink"><b><a href="#cite_ref-Wmlcs_211-0">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="https://www.businesswire.com/news/home/20230531005608/en/UAE's-Falcon-40B-World's-Top-Ranked-AI-Model-from-Technology-Innovation-Institute-is-Now-Royalty-Free">UAE's Falcon 40B, World's Top-Ranked AI Model from Technology Innovation Institute, is Now Royalty-Free</a> <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240208133040/https://www.businesswire.com/news/home/20230531005608/en/UAE%27s-Falcon-40B-World%27s-Top-Ranked-AI-Model-from-Technology-Innovation-Institute-is-Now-Royalty-Free">Archived</a> 2024-02-08 at the <a href="/wiki/Wayback_Machine" title="Wayback Machine">Wayback Machine</a>, 31 May 2023</span> </li> <li id="cite_note-nGOSu-212"><span class="mw-cite-backlink"><b><a href="#cite_ref-nGOSu_212-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWuIrsoyLuDabravolski2023" class="citation arxiv cs1">Wu, Shijie; Irsoy, Ozan; Lu, Steven; Dabravolski, Vadim; Dredze, Mark; Gehrmann, Sebastian; Kambadur, Prabhanjan; Rosenberg, David; Mann, Gideon (March 30, 2023). "BloombergGPT: A Large Language Model for Finance". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2303.17564">2303.17564</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=BloombergGPT%3A+A+Large+Language+Model+for+Finance&rft.date=2023-03-30&rft_id=info%3Aarxiv%2F2303.17564&rft.aulast=Wu&rft.aufirst=Shijie&rft.au=Irsoy%2C+Ozan&rft.au=Lu%2C+Steven&rft.au=Dabravolski%2C+Vadim&rft.au=Dredze%2C+Mark&rft.au=Gehrmann%2C+Sebastian&rft.au=Kambadur%2C+Prabhanjan&rft.au=Rosenberg%2C+David&rft.au=Mann%2C+Gideon&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-9WSFw-213"><span class="mw-cite-backlink"><b><a href="#cite_ref-9WSFw_213-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRenZhouMengHuang2023" class="citation arxiv cs1">Ren, Xiaozhe; Zhou, Pingyi; Meng, Xinfan; Huang, Xinjing; Wang, Yadao; Wang, Weichao; Li, Pengfei; Zhang, Xiaoda; Podolskiy, Alexander; Arshinov, Grigory; Bout, Andrey; Piontkovskaya, Irina; Wei, Jiansheng; Jiang, Xin; Su, Teng; Liu, Qun; Yao, Jun (March 19, 2023). "PanGu-Σ: Towards Trillion Parameter Language Model with Sparse Heterogeneous Computing". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2303.10845">2303.10845</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=PanGu-%CE%A3%3A+Towards+Trillion+Parameter+Language+Model+with+Sparse+Heterogeneous+Computing&rft.date=2023-03-19&rft_id=info%3Aarxiv%2F2303.10845&rft.aulast=Ren&rft.aufirst=Xiaozhe&rft.au=Zhou%2C+Pingyi&rft.au=Meng%2C+Xinfan&rft.au=Huang%2C+Xinjing&rft.au=Wang%2C+Yadao&rft.au=Wang%2C+Weichao&rft.au=Li%2C+Pengfei&rft.au=Zhang%2C+Xiaoda&rft.au=Podolskiy%2C+Alexander&rft.au=Arshinov%2C+Grigory&rft.au=Bout%2C+Andrey&rft.au=Piontkovskaya%2C+Irina&rft.au=Wei%2C+Jiansheng&rft.au=Jiang%2C+Xin&rft.au=Su%2C+Teng&rft.au=Liu%2C+Qun&rft.au=Yao%2C+Jun&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-JiOl8-214"><span class="mw-cite-backlink"><b><a href="#cite_ref-JiOl8_214-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKöpfKilchervon_RütteAnagnostidis2023" class="citation arxiv cs1">Köpf, Andreas; Kilcher, Yannic; von Rütte, Dimitri; Anagnostidis, Sotiris; Tam, Zhi-Rui; Stevens, Keith; Barhoum, Abdullah; Duc, Nguyen Minh; Stanley, Oliver; Nagyfi, Richárd; ES, Shahul; Suri, Sameer; Glushkov, David; Dantuluri, Arnav; Maguire, Andrew (2023-04-14). "OpenAssistant Conversations – Democratizing Large Language Model Alignment". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2304.07327">2304.07327</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=OpenAssistant+Conversations+%E2%80%93+Democratizing+Large+Language+Model+Alignment&rft.date=2023-04-14&rft_id=info%3Aarxiv%2F2304.07327&rft.aulast=K%C3%B6pf&rft.aufirst=Andreas&rft.au=Kilcher%2C+Yannic&rft.au=von+R%C3%BCtte%2C+Dimitri&rft.au=Anagnostidis%2C+Sotiris&rft.au=Tam%2C+Zhi-Rui&rft.au=Stevens%2C+Keith&rft.au=Barhoum%2C+Abdullah&rft.au=Duc%2C+Nguyen+Minh&rft.au=Stanley%2C+Oliver&rft.au=Nagyfi%2C+Rich%C3%A1rd&rft.au=ES%2C+Shahul&rft.au=Suri%2C+Sameer&rft.au=Glushkov%2C+David&rft.au=Dantuluri%2C+Arnav&rft.au=Maguire%2C+Andrew&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-215"><span class="mw-cite-backlink"><b><a href="#cite_ref-215">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWrobel" class="citation web cs1">Wrobel, Sharon. <a rel="nofollow" class="external text" href="https://www.timesofisrael.com/ai21-labs-rolls-out-new-advanced-ai-language-model-to-rival-openai/">"Tel Aviv startup rolls out new advanced AI language model to rival OpenAI"</a>. <i>www.timesofisrael.com</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230724191823/https://www.timesofisrael.com/ai21-labs-rolls-out-new-advanced-ai-language-model-to-rival-openai/">Archived</a> from the original on 2023-07-24<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-07-24</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=www.timesofisrael.com&rft.atitle=Tel+Aviv+startup+rolls+out+new+advanced+AI+language+model+to+rival+OpenAI&rft.aulast=Wrobel&rft.aufirst=Sharon&rft_id=https%3A%2F%2Fwww.timesofisrael.com%2Fai21-labs-rolls-out-new-advanced-ai-language-model-to-rival-openai%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-216"><span class="mw-cite-backlink"><b><a href="#cite_ref-216">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWiggers2023" class="citation web cs1">Wiggers, Kyle (2023-04-13). <a rel="nofollow" class="external text" href="https://techcrunch.com/2023/04/13/with-bedrock-amazon-enters-the-generative-ai-race/">"With Bedrock, Amazon enters the generative AI race"</a>. <i>TechCrunch</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230724102458/https://techcrunch.com/2023/04/13/with-bedrock-amazon-enters-the-generative-ai-race/">Archived</a> from the original on 2023-07-24<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-07-24</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=TechCrunch&rft.atitle=With+Bedrock%2C+Amazon+enters+the+generative+AI+race&rft.date=2023-04-13&rft.aulast=Wiggers&rft.aufirst=Kyle&rft_id=https%3A%2F%2Ftechcrunch.com%2F2023%2F04%2F13%2Fwith-bedrock-amazon-enters-the-generative-ai-race%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-cnbc-20230516-217"><span class="mw-cite-backlink">^ <a href="#cite_ref-cnbc-20230516_217-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-cnbc-20230516_217-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFElias2023" class="citation web cs1">Elias, Jennifer (16 May 2023). <a rel="nofollow" class="external text" href="https://www.cnbc.com/2023/05/16/googles-palm-2-uses-nearly-five-times-more-text-data-than-predecessor.html">"Google's newest A.I. model uses nearly five times more text data for training than its predecessor"</a>. <i><a href="/wiki/CNBC" title="CNBC">CNBC</a></i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230516225326/https://www.cnbc.com/2023/05/16/googles-palm-2-uses-nearly-five-times-more-text-data-than-predecessor.html">Archived</a> from the original on 16 May 2023<span class="reference-accessdate">. 
Retrieved <span class="nowrap">18 May</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=CNBC&rft.atitle=Google%27s+newest+A.I.+model+uses+nearly+five+times+more+text+data+for+training+than+its+predecessor&rft.date=2023-05-16&rft.aulast=Elias&rft.aufirst=Jennifer&rft_id=https%3A%2F%2Fwww.cnbc.com%2F2023%2F05%2F16%2Fgoogles-palm-2-uses-nearly-five-times-more-text-data-than-predecessor.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-pWyLA-218"><span class="mw-cite-backlink"><b><a href="#cite_ref-pWyLA_218-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://blog.google/technology/ai/google-palm-2-ai-large-language-model/">"Introducing PaLM 2"</a>. <i>Google</i>. May 10, 2023. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230518213209/https://blog.google/technology/ai/google-palm-2-ai-large-language-model/">Archived</a> from the original on May 18, 2023<span class="reference-accessdate">. 
Retrieved <span class="nowrap">May 18,</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Google&rft.atitle=Introducing+PaLM+2&rft.date=2023-05-10&rft_id=https%3A%2F%2Fblog.google%2Ftechnology%2Fai%2Fgoogle-palm-2-ai-large-language-model%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-meta-20230719-219"><span class="mw-cite-backlink">^ <a href="#cite_ref-meta-20230719_219-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-meta-20230719_219-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://ai.meta.com/llama/">"Introducing Llama 2: The Next Generation of Our Open Source Large Language Model"</a>. <i>Meta AI</i>. 2023. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240105234629/https://ai.meta.com/llama/">Archived</a> from the original on 2024-01-05<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-07-19</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Meta+AI&rft.atitle=Introducing+Llama+2%3A+The+Next+Generation+of+Our+Open+Source+Large+Language+Model&rft.date=2023&rft_id=https%3A%2F%2Fai.meta.com%2Fllama%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-220"><span class="mw-cite-backlink"><b><a href="#cite_ref-220">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://github.com/meta-llama/llama/blob/main/MODEL_CARD.md">"llama/MODEL_CARD.md at main · meta-llama/llama"</a>. <i>GitHub</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240528090541/https://github.com/meta-llama/llama/blob/main/MODEL_CARD.md">Archived</a> from the original on 2024-05-28<span class="reference-accessdate">. Retrieved <span class="nowrap">2024-05-28</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=GitHub&rft.atitle=llama%2FMODEL_CARD.md+at+main+%C2%B7+meta-llama%2Fllama&rft_id=https%3A%2F%2Fgithub.com%2Fmeta-llama%2Fllama%2Fblob%2Fmain%2FMODEL_CARD.md&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-221"><span class="mw-cite-backlink"><b><a href="#cite_ref-221">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://www.anthropic.com/index/claude-2">"Claude 2"</a>. <i>anthropic.com</i>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20231215212208/https://www.anthropic.com/index/claude-2">Archived</a> from the original on 15 December 2023<span class="reference-accessdate">. Retrieved <span class="nowrap">12 December</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=anthropic.com&rft.atitle=Claude+2&rft_id=https%3A%2F%2Fwww.anthropic.com%2Findex%2Fclaude-2&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-222"><span class="mw-cite-backlink"><b><a href="#cite_ref-222">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFNirmal2023" class="citation web cs1">Nirmal, Dinesh (2023-09-07). <a rel="nofollow" class="external text" href="https://www.ibm.com/blog/building-ai-for-business-ibms-granite-foundation-models">"Building AI for business: IBM's Granite foundation models"</a>. <i>IBM Blog</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240722083855/https://www.ibm.com/blog/building-ai-for-business-ibms-granite-foundation-models/">Archived</a> from the original on 2024-07-22<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2024-08-11</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=IBM+Blog&rft.atitle=Building+AI+for+business%3A+IBM%27s+Granite+foundation+models&rft.date=2023-09-07&rft.aulast=Nirmal&rft.aufirst=Dinesh&rft_id=https%3A%2F%2Fwww.ibm.com%2Fblog%2Fbuilding-ai-for-business-ibms-granite-foundation-models&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-mistral-20230927-223"><span class="mw-cite-backlink"><b><a href="#cite_ref-mistral-20230927_223-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://mistral.ai/news/announcing-mistral-7b/">"Announcing Mistral 7B"</a>. <i>Mistral</i>. 2023. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240106051047/https://mistral.ai/news/announcing-mistral-7b/">Archived</a> from the original on 2024-01-06<span class="reference-accessdate">. Retrieved <span class="nowrap">2023-10-06</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Mistral&rft.atitle=Announcing+Mistral+7B&rft.date=2023&rft_id=https%3A%2F%2Fmistral.ai%2Fnews%2Fannouncing-mistral-7b%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-224"><span class="mw-cite-backlink"><b><a href="#cite_ref-224">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://www.anthropic.com/index/claude-2-1">"Introducing Claude 2.1"</a>. <i>anthropic.com</i>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20231215201726/https://www.anthropic.com/index/claude-2-1">Archived</a> from the original on 15 December 2023<span class="reference-accessdate">. Retrieved <span class="nowrap">12 December</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=anthropic.com&rft.atitle=Introducing+Claude+2.1&rft_id=https%3A%2F%2Fwww.anthropic.com%2Findex%2Fclaude-2-1&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-225"><span class="mw-cite-backlink"><b><a href="#cite_ref-225">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation cs2"><a rel="nofollow" class="external text" href="https://github.com/xai-org/grok-1"><i>xai-org/grok-1</i></a>, xai-org, 2024-03-19, <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240528170731/https://github.com/xai-org/grok-1">archived</a> from the original on 2024-05-28<span class="reference-accessdate">, retrieved <span class="nowrap">2024-03-19</span></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=xai-org%2Fgrok-1&rft.pub=xai-org&rft.date=2024-03-19&rft_id=https%3A%2F%2Fgithub.com%2Fxai-org%2Fgrok-1&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-226"><span class="mw-cite-backlink"><b><a href="#cite_ref-226">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://x.ai/model-card/">"Grok-1 model card"</a>. <i>x.ai</i><span class="reference-accessdate">. 
Retrieved <span class="nowrap">12 December</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=x.ai&rft.atitle=Grok-1+model+card&rft_id=https%3A%2F%2Fx.ai%2Fmodel-card%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-227"><span class="mw-cite-backlink"><b><a href="#cite_ref-227">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://deepmind.google/technologies/gemini/#capabilities">"Gemini – Google DeepMind"</a>. <i>deepmind.google</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20231208015607/https://deepmind.google/technologies/gemini/#capabilities">Archived</a> from the original on 8 December 2023<span class="reference-accessdate">. Retrieved <span class="nowrap">12 December</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=deepmind.google&rft.atitle=Gemini+%E2%80%93+Google+DeepMind&rft_id=https%3A%2F%2Fdeepmind.google%2Ftechnologies%2Fgemini%2F%23capabilities&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-228"><span class="mw-cite-backlink"><b><a href="#cite_ref-228">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFFranzen2023" class="citation web cs1">Franzen, Carl (11 December 2023). <a rel="nofollow" class="external text" href="https://venturebeat.com/ai/mistral-shocks-ai-community-as-latest-open-source-model-eclipses-gpt-3-5-performance/">"Mistral shocks AI community as latest open source model eclipses GPT-3.5 performance"</a>. <i>VentureBeat</i>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20231211213640/https://venturebeat.com/ai/mistral-shocks-ai-community-as-latest-open-source-model-eclipses-gpt-3-5-performance/">Archived</a> from the original on 11 December 2023<span class="reference-accessdate">. Retrieved <span class="nowrap">12 December</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=VentureBeat&rft.atitle=Mistral+shocks+AI+community+as+latest+open+source+model+eclipses+GPT-3.5+performance&rft.date=2023-12-11&rft.aulast=Franzen&rft.aufirst=Carl&rft_id=https%3A%2F%2Fventurebeat.com%2Fai%2Fmistral-shocks-ai-community-as-latest-open-source-model-eclipses-gpt-3-5-performance%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-229"><span class="mw-cite-backlink"><b><a href="#cite_ref-229">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://mistral.ai/news/mixtral-of-experts/">"Mixtral of experts"</a>. <i>mistral.ai</i>. 11 December 2023. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240213104049/https://mistral.ai/news/mixtral-of-experts/">Archived</a> from the original on 13 February 2024<span class="reference-accessdate">. 
Retrieved <span class="nowrap">12 December</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=mistral.ai&rft.atitle=Mixtral+of+experts&rft.date=2023-12-11&rft_id=https%3A%2F%2Fmistral.ai%2Fnews%2Fmixtral-of-experts%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-230"><span class="mw-cite-backlink"><b><a href="#cite_ref-230">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFAI2024" class="citation web cs1">AI, Mistral (2024-04-17). <a rel="nofollow" class="external text" href="https://mistral.ai/news/mixtral-8x22b/">"Cheaper, Better, Faster, Stronger"</a>. <i>mistral.ai</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240505023828/https://mistral.ai/news/mixtral-8x22b/">Archived</a> from the original on 2024-05-05<span class="reference-accessdate">. Retrieved <span class="nowrap">2024-05-05</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=mistral.ai&rft.atitle=Cheaper%2C+Better%2C+Faster%2C+Stronger&rft.date=2024-04-17&rft.aulast=AI&rft.aufirst=Mistral&rft_id=https%3A%2F%2Fmistral.ai%2Fnews%2Fmixtral-8x22b%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-:9-231"><span class="mw-cite-backlink">^ <a href="#cite_ref-:9_231-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:9_231-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFHughes2023" class="citation web cs1">Hughes, Alyssa (12 December 2023). 
<a rel="nofollow" class="external text" href="https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/">"Phi-2: The surprising power of small language models"</a>. <i>Microsoft Research</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20231212232647/https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/">Archived</a> from the original on 12 December 2023<span class="reference-accessdate">. Retrieved <span class="nowrap">13 December</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Microsoft+Research&rft.atitle=Phi-2%3A+The+surprising+power+of+small+language+models&rft.date=2023-12-12&rft.aulast=Hughes&rft.aufirst=Alyssa&rft_id=https%3A%2F%2Fwww.microsoft.com%2Fen-us%2Fresearch%2Fblog%2Fphi-2-the-surprising-power-of-small-language-models%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-232"><span class="mw-cite-backlink"><b><a href="#cite_ref-232">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://blog.google/technology/ai/google-gemini-next-generation-model-february-2024/#context-window">"Our next-generation model: Gemini 1.5"</a>. <i>Google</i>. 15 February 2024. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240216003052/https://blog.google/technology/ai/google-gemini-next-generation-model-february-2024/#context-window">Archived</a> from the original on 16 February 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">16 February</span> 2024</span>. 
<q>This means 1.5 Pro can process vast amounts of information in one go — including 1 hour of video, 11 hours of audio, codebases with over 30,000 lines of code or over 700,000 words. In our research, we've also successfully tested up to 10 million tokens.</q></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Google&rft.atitle=Our+next-generation+model%3A+Gemini+1.5&rft.date=2024-02-15&rft_id=https%3A%2F%2Fblog.google%2Ftechnology%2Fai%2Fgoogle-gemini-next-generation-model-february-2024%2F%23context-window&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-gemma-233"><span class="mw-cite-backlink"><b><a href="#cite_ref-gemma_233-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://ai.google.dev/gemma/terms">"Gemma"</a> – via GitHub.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Gemma&rft_id=https%3A%2F%2Fai.google.dev%2Fgemma%2Fterms&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-234"><span class="mw-cite-backlink"><b><a href="#cite_ref-234">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://www.anthropic.com/news/claude-3-family">"Introducing the next generation of Claude"</a>. <i>www.anthropic.com</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240304143650/https://www.anthropic.com/news/claude-3-family">Archived</a> from the original on 2024-03-04<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2024-03-04</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=www.anthropic.com&rft.atitle=Introducing+the+next+generation+of+Claude&rft_id=https%3A%2F%2Fwww.anthropic.com%2Fnews%2Fclaude-3-family&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-235"><span class="mw-cite-backlink"><b><a href="#cite_ref-235">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://huggingface.co/Fugaku-LLM/Fugaku-LLM-13B">"Fugaku-LLM/Fugaku-LLM-13B · Hugging Face"</a>. <i>huggingface.co</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240517135225/https://huggingface.co/Fugaku-LLM/Fugaku-LLM-13B">Archived</a> from the original on 2024-05-17<span class="reference-accessdate">. Retrieved <span class="nowrap">2024-05-17</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=huggingface.co&rft.atitle=Fugaku-LLM%2FFugaku-LLM-13B+%C2%B7+Hugging+Face&rft_id=https%3A%2F%2Fhuggingface.co%2FFugaku-LLM%2FFugaku-LLM-13B&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-236"><span class="mw-cite-backlink"><b><a href="#cite_ref-236">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms">"Phi-3"</a>. <i>azure.microsoft.com</i>. 23 April 2024. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20240427043835/https://azure.microsoft.com/en-us/blog/introducing-phi-3-redefining-whats-possible-with-slms/">Archived</a> from the original on 2024-04-27<span class="reference-accessdate">. Retrieved <span class="nowrap">2024-04-28</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=azure.microsoft.com&rft.atitle=Phi-3&rft.date=2024-04-23&rft_id=https%3A%2F%2Fazure.microsoft.com%2Fen-us%2Fblog%2Fintroducing-phi-3-redefining-whats-possible-with-slms&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-237"><span class="mw-cite-backlink"><b><a href="#cite_ref-237">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://huggingface.co/docs/transformers/main/en/model_doc/phi3">"Phi-3 Model Documentation"</a>. <i>huggingface.co</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240513141513/https://huggingface.co/docs/transformers/main/en/model_doc/phi3">Archived</a> from the original on 2024-05-13<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2024-04-28</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=huggingface.co&rft.atitle=Phi-3+Model+Documentation&rft_id=https%3A%2F%2Fhuggingface.co%2Fdocs%2Ftransformers%2Fmain%2Fen%2Fmodel_doc%2Fphi3&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-238"><span class="mw-cite-backlink"><b><a href="#cite_ref-238">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://github.com/QwenLM/Qwen2?spm=a3c0i.28768018.7084722650.1.5cd35c10NEqBXm&file=Qwen1.5">"Qwen2"</a>. <i><a href="/wiki/GitHub" title="GitHub">GitHub</a></i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240617072401/https://github.com/QwenLM/Qwen2?spm=a3c0i.28768018.7084722650.1.5cd35c10NEqBXm&file=Qwen1.5">Archived</a> from the original on 2024-06-17<span class="reference-accessdate">. Retrieved <span class="nowrap">2024-06-17</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=GitHub&rft.atitle=Qwen2&rft_id=https%3A%2F%2Fgithub.com%2FQwenLM%2FQwen2%3Fspm%3Da3c0i.28768018.7084722650.1.5cd35c10NEqBXm%26file%3DQwen1.5&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-239"><span class="mw-cite-backlink"><b><a href="#cite_ref-239">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://huggingface.co/nvidia/Nemotron-4-340B-Base">"nvidia/Nemotron-4-340B-Base · Hugging Face"</a>. <i>huggingface.co</i>. 2024-06-14. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20240615010323/https://huggingface.co/nvidia/Nemotron-4-340B-Base">Archived</a> from the original on 2024-06-15<span class="reference-accessdate">. Retrieved <span class="nowrap">2024-06-15</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=huggingface.co&rft.atitle=nvidia%2FNemotron-4-340B-Base+%C2%B7+Hugging+Face&rft.date=2024-06-14&rft_id=https%3A%2F%2Fhuggingface.co%2Fnvidia%2FNemotron-4-340B-Base&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-240"><span class="mw-cite-backlink"><b><a href="#cite_ref-240">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://research.nvidia.com/publication/2024-06_nemotron-4-340b">"Nemotron-4 340B | Research"</a>. <i>research.nvidia.com</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240615010323/https://research.nvidia.com/publication/2024-06_nemotron-4-340b">Archived</a> from the original on 2024-06-15<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2024-06-15</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=research.nvidia.com&rft.atitle=Nemotron-4+340B+%7C+Research&rft_id=https%3A%2F%2Fresearch.nvidia.com%2Fpublication%2F2024-06_nemotron-4-340b&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> <li id="cite_note-241"><span class="mw-cite-backlink"><b><a href="#cite_ref-241">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="https://ai.meta.com/research/publications/the-llama-3-herd-of-models/">"The Llama 3 Herd of Models" (July 23, 2024) Llama Team, AI @ Meta</a></span> </li> <li id="cite_note-242"><span class="mw-cite-backlink"><b><a href="#cite_ref-242">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/MODEL_CARD.md">"llama-models/models/llama3_1/MODEL_CARD.md at main · meta-llama/llama-models"</a>. <i>GitHub</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240723151851/https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/MODEL_CARD.md">Archived</a> from the original on 2024-07-23<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2024-07-23</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=GitHub&rft.atitle=llama-models%2Fmodels%2Fllama3_1%2FMODEL_CARD.md+at+main+%C2%B7+meta-llama%2Fllama-models&rft_id=https%3A%2F%2Fgithub.com%2Fmeta-llama%2Fllama-models%2Fblob%2Fmain%2Fmodels%2Fllama3_1%2FMODEL_CARD.md&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></span> </li> </ol></div></div> <p><br /> </p> <div class="mw-heading mw-heading2"><h2 id="Further_reading">Further reading</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Large_language_model&action=edit&section=39" title="Edit section: Further reading"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <ul><li><a href="/wiki/Dan_Jurafsky" title="Dan Jurafsky">Jurafsky, Dan</a>, Martin, James H. <a rel="nofollow" class="external text" href="https://web.stanford.edu/~jurafsky/slp3/ed3book_jan72023.pdf"><i>Speech and Language Processing: An Introduction to Natural Language Processing, Computational Linguistics, and Speech Recognition</i></a>, 3rd Edition draft, 2023.</li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFZhaoZhouLi2023" class="citation arxiv cs1">Zhao, Wayne Xin; et al. (2023). "A Survey of Large Language Models". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2303.18223">2303.18223</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=A+Survey+of+Large+Language+Models&rft.date=2023&rft_id=info%3Aarxiv%2F2303.18223&rft.aulast=Zhao&rft.aufirst=Wayne+Xin&rft.au=Zhou%2C+Kun&rft.au=Li%2C+Junyi&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKaddour2023" class="citation arxiv cs1">Kaddour, Jean; et al. (2023). "Challenges and Applications of Large Language Models". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2307.10169">2307.10169</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Challenges+and+Applications+of+Large+Language+Models&rft.date=2023&rft_id=info%3Aarxiv%2F2307.10169&rft.aulast=Kaddour&rft.aufirst=Jean&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFYinFuZhaoLi2023" class="citation arxiv cs1">Yin, Shukang; Fu, Chaoyou; Zhao, Sirui; Li, Ke; Sun, Xing; Xu, Tong; Chen, Enhong (2023-06-01). "A Survey on Multimodal Large Language Models". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2306.13549">2306.13549</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CV">cs.CV</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=A+Survey+on+Multimodal+Large+Language+Models&rft.date=2023-06-01&rft_id=info%3Aarxiv%2F2306.13549&rft.aulast=Yin&rft.aufirst=Shukang&rft.au=Fu%2C+Chaoyou&rft.au=Zhao%2C+Sirui&rft.au=Li%2C+Ke&rft.au=Sun%2C+Xing&rft.au=Xu%2C+Tong&rft.au=Chen%2C+Enhong&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://aiindex.stanford.edu/report/">"AI Index Report 2024 – Artificial Intelligence Index"</a>. <i>aiindex.stanford.edu</i><span class="reference-accessdate">. Retrieved <span class="nowrap">2024-05-05</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=aiindex.stanford.edu&rft.atitle=AI+Index+Report+2024+%E2%80%93+Artificial+Intelligence+Index&rft_id=https%3A%2F%2Faiindex.stanford.edu%2Freport%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFFrank2023" class="citation journal cs1">Frank, Michael C. (27 June 2023). <a rel="nofollow" class="external text" href="https://www.nature.com/articles/s44159-023-00211-x">"Baby steps in evaluating the capacities of large language models"</a>. <i>Nature Reviews Psychology</i>. <b>2</b> (8): 451–452. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1038%2Fs44159-023-00211-x">10.1038/s44159-023-00211-x</a>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/2731-0574">2731-0574</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:259713140">259713140</a><span class="reference-accessdate">. Retrieved <span class="nowrap">2 July</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Nature+Reviews+Psychology&rft.atitle=Baby+steps+in+evaluating+the+capacities+of+large+language+models&rft.volume=2&rft.issue=8&rft.pages=451-452&rft.date=2023-06-27&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A259713140%23id-name%3DS2CID&rft.issn=2731-0574&rft_id=info%3Adoi%2F10.1038%2Fs44159-023-00211-x&rft.aulast=Frank&rft.aufirst=Michael+C.&rft_id=https%3A%2F%2Fwww.nature.com%2Farticles%2Fs44159-023-00211-x&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALarge+language+model" class="Z3988"></span></li></ul> <div class="navbox-styles"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374"><style data-mw-deduplicate="TemplateStyles:r1236075235">.mw-parser-output .navbox{box-sizing:border-box;border:1px solid #a2a9b1;width:100%;clear:both;font-size:88%;text-align:center;padding:1px;margin:1em auto 0}.mw-parser-output .navbox .navbox{margin-top:0}.mw-parser-output .navbox+.navbox,.mw-parser-output .navbox+.navbox-styles+.navbox{margin-top:-1px}.mw-parser-output .navbox-inner,.mw-parser-output .navbox-subgroup{width:100%}.mw-parser-output .navbox-group,.mw-parser-output .navbox-title,.mw-parser-output 
.navbox-abovebelow{padding:0.25em 1em;line-height:1.5em;text-align:center}.mw-parser-output .navbox-group{white-space:nowrap;text-align:right}.mw-parser-output .navbox,.mw-parser-output .navbox-subgroup{background-color:#fdfdfd}.mw-parser-output .navbox-list{line-height:1.5em;border-color:#fdfdfd}.mw-parser-output .navbox-list-with-group{text-align:left;border-left-width:2px;border-left-style:solid}.mw-parser-output tr+tr>.navbox-abovebelow,.mw-parser-output tr+tr>.navbox-group,.mw-parser-output tr+tr>.navbox-image,.mw-parser-output tr+tr>.navbox-list{border-top:2px solid #fdfdfd}.mw-parser-output .navbox-title{background-color:#ccf}.mw-parser-output .navbox-abovebelow,.mw-parser-output .navbox-group,.mw-parser-output .navbox-subgroup .navbox-title{background-color:#ddf}.mw-parser-output .navbox-subgroup .navbox-group,.mw-parser-output .navbox-subgroup .navbox-abovebelow{background-color:#e6e6ff}.mw-parser-output .navbox-even{background-color:#f7f7f7}.mw-parser-output .navbox-odd{background-color:transparent}.mw-parser-output .navbox .hlist td dl,.mw-parser-output .navbox .hlist td ol,.mw-parser-output .navbox .hlist td ul,.mw-parser-output .navbox td.hlist dl,.mw-parser-output .navbox td.hlist ol,.mw-parser-output .navbox td.hlist ul{padding:0.125em 0}.mw-parser-output .navbox .navbar{display:block;font-size:100%}.mw-parser-output .navbox-title .navbar{float:left;text-align:left;margin-right:0.5em}body.skin--responsive .mw-parser-output .navbox-image img{max-width:none!important}@media print{body.ns-0 .mw-parser-output .navbox{display:none!important}}</style></div><div role="navigation" class="navbox" aria-labelledby="Natural_language_processing" style="padding:3px"><table class="nowraplinks hlist mw-collapsible autocollapse navbox-inner" style="border-spacing:0;background:transparent;color:inherit"><tbody><tr><th scope="col" class="navbox-title" colspan="2"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374"><link 
rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1239400231"><div class="navbar plainlinks hlist navbar-mini"><ul><li class="nv-view"><a href="/wiki/Template:Natural_language_processing" title="Template:Natural language processing"><abbr title="View this template">v</abbr></a></li><li class="nv-talk"><a href="/wiki/Template_talk:Natural_language_processing" title="Template talk:Natural language processing"><abbr title="Discuss this template">t</abbr></a></li><li class="nv-edit"><a href="/wiki/Special:EditPage/Template:Natural_language_processing" title="Special:EditPage/Template:Natural language processing"><abbr title="Edit this template">e</abbr></a></li></ul></div><div id="Natural_language_processing" style="font-size:114%;margin:0 4em"><a href="/wiki/Natural_language_processing" title="Natural language processing">Natural language processing</a></div></th></tr><tr><th scope="row" class="navbox-group" style="width:1%">General terms</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/AI-complete" title="AI-complete">AI-complete</a></li> <li><a href="/wiki/Bag-of-words_model" title="Bag-of-words model">Bag-of-words</a></li> <li><a href="/wiki/N-gram" title="N-gram">n-gram</a> <ul><li><a href="/wiki/Bigram" title="Bigram">Bigram</a></li> <li><a href="/wiki/Trigram" title="Trigram">Trigram</a></li></ul></li> <li><a href="/wiki/Computational_linguistics" title="Computational linguistics">Computational linguistics</a></li> <li><a href="/wiki/Natural_language_understanding" title="Natural language understanding">Natural language understanding</a></li> <li><a href="/wiki/Stop_word" title="Stop word">Stop words</a></li> <li><a href="/wiki/Text_processing" title="Text processing">Text processing</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Text_mining" title="Text mining">Text analysis</a></th><td 
class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Argument_mining" title="Argument mining">Argument mining</a></li> <li><a href="/wiki/Collocation_extraction" title="Collocation extraction">Collocation extraction</a></li> <li><a href="/wiki/Concept_mining" title="Concept mining">Concept mining</a></li> <li><a href="/wiki/Coreference#Coreference_resolution" title="Coreference">Coreference resolution</a></li> <li><a href="/wiki/Deep_linguistic_processing" title="Deep linguistic processing">Deep linguistic processing</a></li> <li><a href="/wiki/Distant_reading" title="Distant reading">Distant reading</a></li> <li><a href="/wiki/Information_extraction" title="Information extraction">Information extraction</a></li> <li><a href="/wiki/Named-entity_recognition" title="Named-entity recognition">Named-entity recognition</a></li> <li><a href="/wiki/Ontology_learning" title="Ontology learning">Ontology learning</a></li> <li><a href="/wiki/Parsing" title="Parsing">Parsing</a> <ul><li><a href="/wiki/Semantic_parsing" title="Semantic parsing">Semantic parsing</a></li> <li><a href="/wiki/Syntactic_parsing_(computational_linguistics)" title="Syntactic parsing (computational linguistics)">Syntactic parsing</a></li></ul></li> <li><a href="/wiki/Part-of-speech_tagging" title="Part-of-speech tagging">Part-of-speech tagging</a></li> <li><a href="/wiki/Semantic_analysis_(machine_learning)" title="Semantic analysis (machine learning)">Semantic analysis</a></li> <li><a href="/wiki/Semantic_role_labeling" title="Semantic role labeling">Semantic role labeling</a></li> <li><a href="/wiki/Semantic_decomposition_(natural_language_processing)" title="Semantic decomposition (natural language processing)">Semantic decomposition</a></li> <li><a href="/wiki/Semantic_similarity" title="Semantic similarity">Semantic similarity</a></li> <li><a href="/wiki/Sentiment_analysis" title="Sentiment analysis">Sentiment 
analysis</a></li></ul> <ul><li><a href="/wiki/Terminology_extraction" title="Terminology extraction">Terminology extraction</a></li> <li><a href="/wiki/Text_mining" title="Text mining">Text mining</a></li> <li><a href="/wiki/Textual_entailment" title="Textual entailment">Textual entailment</a></li> <li><a href="/wiki/Truecasing" title="Truecasing">Truecasing</a></li> <li><a href="/wiki/Word-sense_disambiguation" title="Word-sense disambiguation">Word-sense disambiguation</a></li> <li><a href="/wiki/Word-sense_induction" title="Word-sense induction">Word-sense induction</a></li></ul> </div><table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbody><tr><th id="Text_segmentation" scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Text_segmentation" title="Text segmentation">Text segmentation</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Compound-term_processing" title="Compound-term processing">Compound-term processing</a></li> <li><a href="/wiki/Lemmatisation" class="mw-redirect" title="Lemmatisation">Lemmatisation</a></li> <li><a href="/wiki/Lexical_analysis" title="Lexical analysis">Lexical analysis</a></li> <li><a href="/wiki/Shallow_parsing" title="Shallow parsing">Text chunking</a></li> <li><a href="/wiki/Stemming" title="Stemming">Stemming</a></li> <li><a href="/wiki/Sentence_boundary_disambiguation" title="Sentence boundary disambiguation">Sentence segmentation</a></li> <li><a href="/wiki/Word#Word_boundaries" title="Word">Word segmentation</a></li></ul> </div></td></tr></tbody></table><div> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Automatic_summarization" title="Automatic summarization">Automatic summarization</a></th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a 
href="/wiki/Multi-document_summarization" title="Multi-document summarization">Multi-document summarization</a></li> <li><a href="/wiki/Sentence_extraction" title="Sentence extraction">Sentence extraction</a></li> <li><a href="/wiki/Text_simplification" title="Text simplification">Text simplification</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Machine_translation" title="Machine translation">Machine translation</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Computer-assisted_translation" title="Computer-assisted translation">Computer-assisted</a></li> <li><a href="/wiki/Example-based_machine_translation" title="Example-based machine translation">Example-based</a></li> <li><a href="/wiki/Rule-based_machine_translation" title="Rule-based machine translation">Rule-based</a></li> <li><a href="/wiki/Statistical_machine_translation" title="Statistical machine translation">Statistical</a></li> <li><a href="/wiki/Transfer-based_machine_translation" title="Transfer-based machine translation">Transfer-based</a></li> <li><a href="/wiki/Neural_machine_translation" title="Neural machine translation">Neural</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Distributional_semantics" title="Distributional semantics">Distributional semantics</a> models</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/BERT_(language_model)" title="BERT (language model)">BERT</a></li> <li><a href="/wiki/Document-term_matrix" title="Document-term matrix">Document-term matrix</a></li> <li><a href="/wiki/Explicit_semantic_analysis" title="Explicit semantic analysis">Explicit semantic analysis</a></li> <li><a href="/wiki/FastText" title="FastText">fastText</a></li> <li><a href="/wiki/GloVe" 
title="GloVe">GloVe</a></li> <li><a href="/wiki/Language_model" title="Language model">Language model</a> (<a class="mw-selflink selflink">large</a>)</li> <li><a href="/wiki/Latent_semantic_analysis" title="Latent semantic analysis">Latent semantic analysis</a></li> <li><a href="/wiki/Seq2seq" title="Seq2seq">Seq2seq</a></li> <li><a href="/wiki/Word_embedding" title="Word embedding">Word embedding</a></li> <li><a href="/wiki/Word2vec" title="Word2vec">Word2vec</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Language_resource" title="Language resource">Language resources</a>,<br />datasets and corpora</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"></div><table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbody><tr><th scope="row" class="navbox-group" style="width:1%">Types and<br />standards</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Corpus_linguistics" title="Corpus linguistics">Corpus linguistics</a></li> <li><a href="/wiki/Lexical_resource" title="Lexical resource">Lexical resource</a></li> <li><a href="/wiki/Linguistic_Linked_Open_Data" title="Linguistic Linked Open Data">Linguistic Linked Open Data</a></li> <li><a href="/wiki/Machine-readable_dictionary" title="Machine-readable dictionary">Machine-readable dictionary</a></li> <li><a href="/wiki/Parallel_text" title="Parallel text">Parallel text</a></li> <li><a href="/wiki/PropBank" title="PropBank">PropBank</a></li> <li><a href="/wiki/Semantic_network" title="Semantic network">Semantic network</a></li> <li><a href="/wiki/Simple_Knowledge_Organization_System" title="Simple Knowledge Organization System">Simple Knowledge Organization System</a></li> <li><a href="/wiki/Speech_corpus" title="Speech corpus">Speech corpus</a></li> <li><a href="/wiki/Text_corpus" title="Text 
corpus">Text corpus</a></li> <li><a href="/wiki/Thesaurus_(information_retrieval)" title="Thesaurus (information retrieval)">Thesaurus (information retrieval)</a></li> <li><a href="/wiki/Treebank" title="Treebank">Treebank</a></li> <li><a href="/wiki/Universal_Dependencies" title="Universal Dependencies">Universal Dependencies</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Data</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/BabelNet" title="BabelNet">BabelNet</a></li> <li><a href="/wiki/Bank_of_English" title="Bank of English">Bank of English</a></li> <li><a href="/wiki/DBpedia" title="DBpedia">DBpedia</a></li> <li><a href="/wiki/FrameNet" title="FrameNet">FrameNet</a></li> <li><a href="/wiki/Google_Ngram_Viewer" class="mw-redirect" title="Google Ngram Viewer">Google Ngram Viewer</a></li> <li><a href="/wiki/UBY" title="UBY">UBY</a></li> <li><a href="/wiki/WordNet" title="WordNet">WordNet</a></li> <li><a href="/wiki/Wikidata" title="Wikidata">Wikidata</a></li></ul> </div></td></tr></tbody></table><div></div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Automatic_identification_and_data_capture" title="Automatic identification and data capture">Automatic identification<br />and data capture</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Speech_recognition" title="Speech recognition">Speech recognition</a></li> <li><a href="/wiki/Speech_segmentation" title="Speech segmentation">Speech segmentation</a></li> <li><a href="/wiki/Speech_synthesis" title="Speech synthesis">Speech synthesis</a></li> <li><a href="/wiki/Natural_language_generation" title="Natural language generation">Natural language generation</a></li> <li><a href="/wiki/Optical_character_recognition" title="Optical character 
recognition">Optical character recognition</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Topic_model" title="Topic model">Topic model</a></th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Document_classification" title="Document classification">Document classification</a></li> <li><a href="/wiki/Latent_Dirichlet_allocation" title="Latent Dirichlet allocation">Latent Dirichlet allocation</a></li> <li><a href="/wiki/Pachinko_allocation" title="Pachinko allocation">Pachinko allocation</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Computer-assisted_reviewing" title="Computer-assisted reviewing">Computer-assisted<br />reviewing</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Automated_essay_scoring" title="Automated essay scoring">Automated essay scoring</a></li> <li><a href="/wiki/Concordancer" title="Concordancer">Concordancer</a></li> <li><a href="/wiki/Grammar_checker" title="Grammar checker">Grammar checker</a></li> <li><a href="/wiki/Predictive_text" title="Predictive text">Predictive text</a></li> <li><a href="/wiki/Pronunciation_assessment" title="Pronunciation assessment">Pronunciation assessment</a></li> <li><a href="/wiki/Spell_checker" title="Spell checker">Spell checker</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Natural-language_user_interface" title="Natural-language user interface">Natural language<br />user interface</a></th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Chatbot" title="Chatbot">Chatbot</a></li> <li><a href="/wiki/Interactive_fiction" title="Interactive fiction">Interactive 
fiction</a> (cf. <a href="/wiki/Syntax_guessing" class="mw-redirect" title="Syntax guessing">Syntax guessing</a>)</li> <li><a href="/wiki/Question_answering" title="Question answering">Question answering</a></li> <li><a href="/wiki/Virtual_assistant" title="Virtual assistant">Virtual assistant</a></li> <li><a href="/wiki/Voice_user_interface" title="Voice user interface">Voice user interface</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Related</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Formal_semantics_(natural_language)" title="Formal semantics (natural language)">Formal semantics</a></li> <li><a href="/wiki/Hallucination_(artificial_intelligence)" title="Hallucination (artificial intelligence)">Hallucination</a></li> <li><a href="/wiki/Natural_Language_Toolkit" title="Natural Language Toolkit">Natural Language Toolkit</a></li> <li><a href="/wiki/SpaCy" title="SpaCy">spaCy</a></li></ul> </div></td></tr></tbody></table></div> <div class="navbox-styles"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236075235"></div><div role="navigation" class="navbox" aria-labelledby="Artificial_intelligence" style="padding:3px"><table class="nowraplinks hlist mw-collapsible autocollapse navbox-inner" style="border-spacing:0;background:transparent;color:inherit"><tbody><tr><th scope="col" class="navbox-title" colspan="2"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1239400231"><div class="navbar plainlinks hlist navbar-mini"><ul><li class="nv-view"><a href="/wiki/Template:Artificial_intelligence_navbox" title="Template:Artificial intelligence navbox"><abbr title="View this template">v</abbr></a></li><li class="nv-talk"><a 
href="/wiki/Template_talk:Artificial_intelligence_navbox" title="Template talk:Artificial intelligence navbox"><abbr title="Discuss this template">t</abbr></a></li><li class="nv-edit"><a href="/wiki/Special:EditPage/Template:Artificial_intelligence_navbox" title="Special:EditPage/Template:Artificial intelligence navbox"><abbr title="Edit this template">e</abbr></a></li></ul></div><div id="Artificial_intelligence" style="font-size:114%;margin:0 4em"><a href="/wiki/Artificial_intelligence" title="Artificial intelligence">Artificial intelligence</a></div></th></tr><tr><th scope="row" class="navbox-group" style="width:1%">Concepts</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Parameter" title="Parameter">Parameter</a> <ul><li><a href="/wiki/Hyperparameter_(machine_learning)" title="Hyperparameter (machine learning)">Hyperparameter</a></li></ul></li> <li><a href="/wiki/Loss_functions_for_classification" title="Loss functions for classification">Loss functions</a></li> <li><a href="/wiki/Regression_analysis" title="Regression analysis">Regression</a> <ul><li><a href="/wiki/Bias%E2%80%93variance_tradeoff" title="Bias–variance tradeoff">Bias–variance tradeoff</a></li> <li><a href="/wiki/Double_descent" title="Double descent">Double descent</a></li> <li><a href="/wiki/Overfitting" title="Overfitting">Overfitting</a></li></ul></li> <li><a href="/wiki/Cluster_analysis" title="Cluster analysis">Clustering</a></li> <li><a href="/wiki/Gradient_descent" title="Gradient descent">Gradient descent</a> <ul><li><a href="/wiki/Stochastic_gradient_descent" title="Stochastic gradient descent">SGD</a></li> <li><a href="/wiki/Quasi-Newton_method" title="Quasi-Newton method">Quasi-Newton method</a></li> <li><a href="/wiki/Conjugate_gradient_method" title="Conjugate gradient method">Conjugate gradient method</a></li></ul></li> <li><a href="/wiki/Backpropagation" 
title="Backpropagation">Backpropagation</a></li> <li><a href="/wiki/Attention_(machine_learning)" title="Attention (machine learning)">Attention</a></li> <li><a href="/wiki/Convolution" title="Convolution">Convolution</a></li> <li><a href="/wiki/Normalization_(machine_learning)" title="Normalization (machine learning)">Normalization</a> <ul><li><a href="/wiki/Batch_normalization" title="Batch normalization">Batchnorm</a></li></ul></li> <li><a href="/wiki/Activation_function" title="Activation function">Activation</a> <ul><li><a href="/wiki/Softmax_function" title="Softmax function">Softmax</a></li> <li><a href="/wiki/Sigmoid_function" title="Sigmoid function">Sigmoid</a></li> <li><a href="/wiki/Rectifier_(neural_networks)" title="Rectifier (neural networks)">Rectifier</a></li></ul></li> <li><a href="/wiki/Gating_mechanism" title="Gating mechanism">Gating</a></li> <li><a href="/wiki/Weight_initialization" title="Weight initialization">Weight initialization</a></li> <li><a href="/wiki/Regularization_(mathematics)" title="Regularization (mathematics)">Regularization</a></li> <li><a href="/wiki/Training,_validation,_and_test_data_sets" title="Training, validation, and test data sets">Datasets</a> <ul><li><a href="/wiki/Data_augmentation" title="Data augmentation">Augmentation</a></li></ul></li> <li><a href="/wiki/Reinforcement_learning" title="Reinforcement learning">Reinforcement learning</a> <ul><li><a href="/wiki/Q-learning" title="Q-learning">Q-learning</a></li> <li><a href="/wiki/State%E2%80%93action%E2%80%93reward%E2%80%93state%E2%80%93action" title="State–action–reward–state–action">SARSA</a></li> <li><a href="/wiki/Imitation_learning" title="Imitation learning">Imitation</a></li></ul></li> <li><a href="/wiki/Diffusion_process" title="Diffusion process">Diffusion</a></li> <li><a href="/wiki/Autoregressive_model" title="Autoregressive model">Autoregression</a></li> <li><a href="/wiki/Adversarial_machine_learning" title="Adversarial machine 
learning">Adversary</a></li> <li><a href="/wiki/Hallucination_(artificial_intelligence)" title="Hallucination (artificial intelligence)">Hallucination</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Applications</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Machine_learning" title="Machine learning">Machine learning</a> <ul><li><a href="/wiki/Prompt_engineering#In-context_learning" title="Prompt engineering">In-context learning</a></li></ul></li> <li><a href="/wiki/Neural_network_(machine_learning)" title="Neural network (machine learning)">Artificial neural network</a> <ul><li><a href="/wiki/Deep_learning" title="Deep learning">Deep learning</a></li></ul></li> <li><a href="/wiki/Language_model" title="Language model">Language model</a> <ul><li><a class="mw-selflink selflink">Large language model</a></li> <li><a href="/wiki/Neural_machine_translation" title="Neural machine translation">NMT</a></li></ul></li> <li><a href="/wiki/Artificial_general_intelligence" title="Artificial general intelligence">Artificial general intelligence</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Implementations</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"></div><table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbody><tr><th scope="row" class="navbox-group" style="width:1%">Audio–visual</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/AlexNet" title="AlexNet">AlexNet</a></li> <li><a href="/wiki/WaveNet" title="WaveNet">WaveNet</a></li> <li><a href="/wiki/Human_image_synthesis" title="Human image synthesis">Human image synthesis</a></li> <li><a href="/wiki/Handwriting_recognition" title="Handwriting 
recognition">HWR</a></li> <li><a href="/wiki/Optical_character_recognition" title="Optical character recognition">OCR</a></li> <li><a href="/wiki/Deep_learning_speech_synthesis" title="Deep learning speech synthesis">Speech synthesis</a> <ul><li><a href="/wiki/ElevenLabs" title="ElevenLabs">ElevenLabs</a></li></ul></li> <li><a href="/wiki/Speech_recognition" title="Speech recognition">Speech recognition</a></li> <li><a href="/wiki/Facial_recognition_system" title="Facial recognition system">Facial recognition</a></li> <li><a href="/wiki/AlphaFold" title="AlphaFold">AlphaFold</a></li> <li><a href="/wiki/Text-to-image_model" title="Text-to-image model">Text-to-image models</a> <ul><li><a href="/wiki/Latent_diffusion_model" title="Latent diffusion model">Latent diffusion model</a></li> <li><a href="/wiki/DALL-E" title="DALL-E">DALL-E</a></li> <li><a href="/wiki/Flux_(text-to-image_model)" title="Flux (text-to-image model)">Flux</a></li> <li><a href="/wiki/Ideogram_(text-to-image_model)" title="Ideogram (text-to-image model)">Ideogram</a></li> <li><a href="/wiki/Midjourney" title="Midjourney">Midjourney</a></li> <li><a href="/wiki/Stable_Diffusion" title="Stable Diffusion">Stable Diffusion</a></li></ul></li> <li><a href="/wiki/Text-to-video_model" title="Text-to-video model">Text-to-video models</a> <ul><li><a href="/wiki/Sora_(text-to-video_model)" title="Sora (text-to-video model)">Sora</a></li> <li><a href="/wiki/Dream_Machine_(text-to-video_model)" title="Dream Machine (text-to-video model)">Dream Machine</a></li> <li><a href="/wiki/VideoPoet" title="VideoPoet">VideoPoet</a></li></ul></li> <li><a href="/wiki/Whisper_(speech_recognition_system)" title="Whisper (speech recognition system)">Whisper</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Text</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Word2vec" 
title="Word2vec">Word2vec</a></li> <li><a href="/wiki/Seq2seq" title="Seq2seq">Seq2seq</a></li> <li><a href="/wiki/GloVe" title="GloVe">GloVe</a></li> <li><a href="/wiki/BERT_(language_model)" title="BERT (language model)">BERT</a></li> <li><a href="/wiki/T5_(language_model)" title="T5 (language model)">T5</a></li> <li><a href="/wiki/Llama_(language_model)" title="Llama (language model)">Llama</a></li> <li><a href="/wiki/Chinchilla_(language_model)" title="Chinchilla (language model)">Chinchilla AI</a></li> <li><a href="/wiki/PaLM" title="PaLM">PaLM</a></li> <li><a href="/wiki/Generative_pre-trained_transformer" title="Generative pre-trained transformer">GPT</a> <ul><li><a href="/wiki/GPT-1" title="GPT-1">1</a></li> <li><a href="/wiki/GPT-J" title="GPT-J">J</a></li> <li><a href="/wiki/GPT-2" title="GPT-2">2</a></li> <li><a href="/wiki/GPT-3" title="GPT-3">3</a></li> <li><a href="/wiki/ChatGPT" title="ChatGPT">ChatGPT</a></li> <li><a href="/wiki/GPT-4" title="GPT-4">4</a></li> <li><a href="/wiki/GPT-4o" title="GPT-4o">4o</a></li> <li><a href="/wiki/OpenAI_o1" title="OpenAI o1">o1</a></li></ul></li> <li><a href="/wiki/Claude_(language_model)" title="Claude (language model)">Claude</a></li> <li><a href="/wiki/Gemini_(language_model)" title="Gemini (language model)">Gemini</a></li> <li><a href="/wiki/Grok_(chatbot)" title="Grok (chatbot)">Grok</a></li> <li><a href="/wiki/LaMDA" title="LaMDA">LaMDA</a></li> <li><a href="/wiki/BLOOM_(language_model)" title="BLOOM (language model)">BLOOM</a></li> <li><a href="/wiki/Project_Debater" title="Project Debater">Project Debater</a></li> <li><a href="/wiki/IBM_Watson" title="IBM Watson">IBM Watson</a></li> <li><a href="/wiki/IBM_Watsonx" title="IBM Watsonx">IBM Watsonx</a></li> <li><a href="/wiki/IBM_Granite" title="IBM Granite">Granite</a></li> <li><a href="/wiki/Huawei_PanGu" title="Huawei PanGu">PanGu-Σ</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Decisional</th><td 
class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/AlphaGo" title="AlphaGo">AlphaGo</a></li> <li><a href="/wiki/AlphaZero" title="AlphaZero">AlphaZero</a></li> <li><a href="/wiki/OpenAI_Five" title="OpenAI Five">OpenAI Five</a></li> <li><a href="/wiki/Self-driving_car" title="Self-driving car">Self-driving car</a></li> <li><a href="/wiki/MuZero" title="MuZero">MuZero</a></li> <li><a href="/wiki/Action_selection" title="Action selection">Action selection</a> <ul><li><a href="/wiki/AutoGPT" title="AutoGPT">AutoGPT</a></li></ul></li> <li><a href="/wiki/Robot_control" title="Robot control">Robot control</a></li></ul> </div></td></tr></tbody></table><div></div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">People</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Alan_Turing" title="Alan Turing">Alan Turing</a></li> <li><a href="/wiki/Claude_Shannon" title="Claude Shannon">Claude Shannon</a></li> <li><a href="/wiki/Allen_Newell" title="Allen Newell">Allen Newell</a></li> <li><a href="/wiki/Herbert_A._Simon" title="Herbert A. Simon">Herbert A. 
Simon</a></li> <li><a href="/wiki/Frank_Rosenblatt" title="Frank Rosenblatt">Frank Rosenblatt</a></li> <li><a href="/wiki/Marvin_Minsky" title="Marvin Minsky">Marvin Minsky</a></li> <li><a href="/wiki/John_McCarthy_(computer_scientist)" title="John McCarthy (computer scientist)">John McCarthy</a></li> <li><a href="/wiki/Nathaniel_Rochester_(computer_scientist)" title="Nathaniel Rochester (computer scientist)">Nathaniel Rochester</a></li> <li><a href="/wiki/Seymour_Papert" title="Seymour Papert">Seymour Papert</a></li> <li><a href="/wiki/Joseph_Weizenbaum" title="Joseph Weizenbaum">Joseph Weizenbaum</a></li> <li><a href="/wiki/Bernard_Widrow" title="Bernard Widrow">Bernard Widrow</a></li> <li><a href="/wiki/Paul_Werbos" title="Paul Werbos">Paul Werbos</a></li> <li><a href="/wiki/Yoshua_Bengio" title="Yoshua Bengio">Yoshua Bengio</a></li> <li><a href="/wiki/Alex_Graves_(computer_scientist)" title="Alex Graves (computer scientist)">Alex Graves</a></li> <li><a href="/wiki/Ian_Goodfellow" title="Ian Goodfellow">Ian Goodfellow</a></li> <li><a href="/wiki/Stephen_Grossberg" title="Stephen Grossberg">Stephen Grossberg</a></li> <li><a href="/wiki/Demis_Hassabis" title="Demis Hassabis">Demis Hassabis</a></li> <li><a href="/wiki/Geoffrey_Hinton" title="Geoffrey Hinton">Geoffrey Hinton</a></li> <li><a href="/wiki/Yann_LeCun" title="Yann LeCun">Yann LeCun</a></li> <li><a href="/wiki/Fei-Fei_Li" title="Fei-Fei Li">Fei-Fei Li</a></li> <li><a href="/wiki/Andrew_Ng" title="Andrew Ng">Andrew Ng</a></li> <li><a href="/wiki/J%C3%BCrgen_Schmidhuber" title="Jürgen Schmidhuber">Jürgen Schmidhuber</a></li> <li><a href="/wiki/David_Silver_(computer_scientist)" title="David Silver (computer scientist)">David Silver</a></li> <li><a href="/wiki/Ilya_Sutskever" title="Ilya Sutskever">Ilya Sutskever</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Organizations</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div 
style="padding:0 0.25em"> <ul><li><a href="/wiki/Anthropic" title="Anthropic">Anthropic</a></li> <li><a href="/wiki/EleutherAI" title="EleutherAI">EleutherAI</a></li> <li><a href="/wiki/Google_DeepMind" title="Google DeepMind">Google DeepMind</a></li> <li><a href="/wiki/Hugging_Face" title="Hugging Face">Hugging Face</a></li> <li><a href="/wiki/Kuaishou" title="Kuaishou">Kuaishou</a></li> <li><a href="/wiki/Meta_AI" title="Meta AI">Meta AI</a></li> <li><a href="/wiki/Mila_(research_institute)" title="Mila (research institute)">Mila</a></li> <li><a href="/wiki/MiniMax_(company)" title="MiniMax (company)">MiniMax</a></li> <li><a href="/wiki/Mistral_AI" title="Mistral AI">Mistral AI</a></li> <li><a href="/wiki/MIT_Computer_Science_and_Artificial_Intelligence_Laboratory" title="MIT Computer Science and Artificial Intelligence Laboratory">MIT CSAIL</a></li> <li><a href="/wiki/OpenAI" title="OpenAI">OpenAI</a></li> <li><a href="/wiki/Runway_(company)" title="Runway (company)">Runway</a></li> <li><a href="/wiki/XAI_(company)" title="XAI (company)">xAI</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Architectures</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Neural_Turing_machine" title="Neural Turing machine">Neural Turing machine</a></li> <li><a href="/wiki/Differentiable_neural_computer" title="Differentiable neural computer">Differentiable neural computer</a></li> <li><a href="/wiki/Transformer_(deep_learning_architecture)" title="Transformer (deep learning architecture)">Transformer</a> <ul><li><a href="/wiki/Vision_transformer" title="Vision transformer">Vision transformer (ViT)</a></li></ul></li> <li><a href="/wiki/Recurrent_neural_network" title="Recurrent neural network">Recurrent neural network (RNN)</a></li> <li><a href="/wiki/Long_short-term_memory" title="Long short-term memory">Long short-term memory (LSTM)</a></li> 
<li><a href="/wiki/Gated_recurrent_unit" title="Gated recurrent unit">Gated recurrent unit (GRU)</a></li> <li><a href="/wiki/Echo_state_network" title="Echo state network">Echo state network</a></li> <li><a href="/wiki/Multilayer_perceptron" title="Multilayer perceptron">Multilayer perceptron (MLP)</a></li> <li><a href="/wiki/Convolutional_neural_network" title="Convolutional neural network">Convolutional neural network (CNN)</a></li> <li><a href="/wiki/Residual_neural_network" title="Residual neural network">Residual neural network (ResNet)</a></li> <li><a href="/wiki/Highway_network" title="Highway network">Highway network</a></li> <li><a href="/wiki/Mamba_(deep_learning_architecture)" title="Mamba (deep learning architecture)">Mamba</a></li> <li><a href="/wiki/Autoencoder" title="Autoencoder">Autoencoder</a></li> <li><a href="/wiki/Variational_autoencoder" title="Variational autoencoder">Variational autoencoder (VAE)</a></li> <li><a href="/wiki/Generative_adversarial_network" title="Generative adversarial network">Generative adversarial network (GAN)</a></li> <li><a href="/wiki/Graph_neural_network" title="Graph neural network">Graph neural network (GNN)</a></li></ul> </div></td></tr><tr><td class="navbox-abovebelow" colspan="2"><div> <ul><li><span class="noviewer" typeof="mw:File"><a href="/wiki/File:Symbol_portal_class.svg" class="mw-file-description" title="Portal"><img alt="" src="//upload.wikimedia.org/wikipedia/en/thumb/e/e2/Symbol_portal_class.svg/16px-Symbol_portal_class.svg.png" decoding="async" width="16" height="16" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/en/thumb/e/e2/Symbol_portal_class.svg/23px-Symbol_portal_class.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/e/e2/Symbol_portal_class.svg/31px-Symbol_portal_class.svg.png 2x" data-file-width="180" data-file-height="185" /></a></span> Portals <ul><li><a href="/wiki/Portal:Technology" title="Portal:Technology">Technology</a></li></ul></li> <li><span class="noviewer" 
typeof="mw:File"><span title="Category"><img alt="" src="//upload.wikimedia.org/wikipedia/en/thumb/9/96/Symbol_category_class.svg/16px-Symbol_category_class.svg.png" decoding="async" width="16" height="16" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/96/Symbol_category_class.svg/23px-Symbol_category_class.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/96/Symbol_category_class.svg/31px-Symbol_category_class.svg.png 2x" data-file-width="180" data-file-height="185" /></span></span> Categories <ul><li><a href="/wiki/Category:Artificial_neural_networks" title="Category:Artificial neural networks">Artificial neural networks</a></li> <li><a href="/wiki/Category:Machine_learning" title="Category:Machine learning">Machine learning</a></li></ul></li></ul> </div></td></tr></tbody></table></div> <!-- NewPP limit report Parsed by mw‐web.codfw.main‐f69cdc8f6‐24l65 Cached time: 20241122152908 Cache expiry: 2592000 Reduced expiry: false Complications: [vary‐revision‐sha1, show‐toc] CPU time usage: 2.979 seconds Real time usage: 3.250 seconds Preprocessor visited node count: 19504/1000000 Post‐expand include size: 684454/2097152 bytes Template argument size: 13887/2097152 bytes Highest expansion depth: 23/100 Expensive parser function count: 23/500 Unstrip recursion depth: 1/20 Unstrip post‐expand size: 913188/5000000 bytes Lua time usage: 2.008/10.000 seconds Lua memory usage: 9463208/52428800 bytes Lua Profile: ? 
320 ms 15.4% MediaWiki\Extension\Scribunto\Engines\LuaSandbox\LuaSandboxCallback::callParserFunction 220 ms 10.6% MediaWiki\Extension\Scribunto\Engines\LuaSandbox\LuaSandboxCallback::gsub 200 ms 9.6% dataWrapper <mw.lua:672> 200 ms 9.6% MediaWiki\Extension\Scribunto\Engines\LuaSandbox\LuaSandboxCallback::find 160 ms 7.7% <mw.lua:694> 140 ms 6.7% type 100 ms 4.8% MediaWiki\Extension\Scribunto\Engines\LuaSandbox\LuaSandboxCallback::preprocess 80 ms 3.8% (for generator) 60 ms 2.9% MediaWiki\Extension\Scribunto\Engines\LuaSandbox\LuaSandboxCallback::match 60 ms 2.9% [others] 540 ms 26.0% Number of Wikibase entities loaded: 0/400 --> <!-- Transclusion expansion time report (%,ms,calls,template) 100.00% 2788.884 1 -total 73.70% 2055.287 2 Template:Reflist 24.07% 671.244 76 Template:Cite_arXiv 19.57% 545.646 100 Template:Cite_web 8.63% 240.776 28 Template:Cite_journal 7.25% 202.072 1 Template:Harvnb 5.06% 141.079 1 Template:Machine_learning 4.83% 134.763 1 Template:Sidebar_with_collapsible_lists 3.02% 84.158 1 Template:Short_description 2.59% 72.245 13 Template:Citation --> <!-- Saved in parser cache with key enwiki:pcache:idhash:73248112-0!canonical and timestamp 20241122152908 and revision id 1258457238. 
Rendering was triggered because: page-view --> </div><!--esi <esi:include src="/esitest-fa8a495983347898/content" /> --><noscript><img src="https://login.wikimedia.org/wiki/Special:CentralAutoLogin/start?type=1x1" alt="" width="1" height="1" style="border: none; position: absolute;"></noscript> <div class="printfooter" data-nosnippet="">Retrieved from "<a dir="ltr" href="https://en.wikipedia.org/w/index.php?title=Large_language_model&oldid=1258457238">https://en.wikipedia.org/w/index.php?title=Large_language_model&oldid=1258457238</a>"</div></div> <div id="catlinks" class="catlinks" data-mw="interface"><div id="mw-normal-catlinks" class="mw-normal-catlinks"><a href="/wiki/Help:Category" title="Help:Category">Categories</a>: <ul><li><a href="/wiki/Category:Large_language_models" title="Category:Large language models">Large language models</a></li><li><a href="/wiki/Category:Deep_learning" title="Category:Deep learning">Deep learning</a></li><li><a href="/wiki/Category:Natural_language_processing" title="Category:Natural language processing">Natural language processing</a></li></ul></div><div id="mw-hidden-catlinks" class="mw-hidden-catlinks mw-hidden-cats-hidden">Hidden categories: <ul><li><a href="/wiki/Category:CS1:_long_volume_value" title="Category:CS1: long volume value">CS1: long volume value</a></li><li><a href="/wiki/Category:Webarchive_template_wayback_links" title="Category:Webarchive template wayback links">Webarchive template wayback links</a></li><li><a href="/wiki/Category:Articles_with_short_description" title="Category:Articles with short description">Articles with short description</a></li><li><a href="/wiki/Category:Short_description_is_different_from_Wikidata" title="Category:Short description is different from Wikidata">Short description is different from Wikidata</a></li><li><a href="/wiki/Category:Articles_containing_potentially_dated_statements_from_2024" title="Category:Articles containing potentially dated statements from 2024">Articles 
containing potentially dated statements from 2024</a></li><li><a href="/wiki/Category:All_articles_containing_potentially_dated_statements" title="Category:All articles containing potentially dated statements">All articles containing potentially dated statements</a></li><li><a href="/wiki/Category:Articles_containing_potentially_dated_statements_from_June_2024" title="Category:Articles containing potentially dated statements from June 2024">Articles containing potentially dated statements from June 2024</a></li><li><a href="/wiki/Category:All_accuracy_disputes" title="Category:All accuracy disputes">All accuracy disputes</a></li><li><a href="/wiki/Category:Articles_with_disputed_statements_from_September_2024" title="Category:Articles with disputed statements from September 2024">Articles with disputed statements from September 2024</a></li><li><a href="/wiki/Category:All_articles_with_unsourced_statements" title="Category:All articles with unsourced statements">All articles with unsourced statements</a></li><li><a href="/wiki/Category:Articles_with_unsourced_statements_from_February_2024" title="Category:Articles with unsourced statements from February 2024">Articles with unsourced statements from February 2024</a></li><li><a href="/wiki/Category:Articles_containing_potentially_dated_statements_from_October_2024" title="Category:Articles containing potentially dated statements from October 2024">Articles containing potentially dated statements from October 2024</a></li></ul></div></div> </div> </main> </div> <div class="mw-footer-container"> <footer id="footer" class="mw-footer" > <ul id="footer-info"> <li id="footer-info-lastmod"> This page was last edited on 19 November 2024, at 21:04<span class="anonymous-show"> (UTC)</span>.</li> <li id="footer-info-copyright">Text is available under the <a href="/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_4.0_International_License" title="Wikipedia:Text of the Creative Commons Attribution-ShareAlike 
4.0 International License">Creative Commons Attribution-ShareAlike 4.0 License</a>; additional terms may apply. By using this site, you agree to the <a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Terms_of_Use" class="extiw" title="foundation:Special:MyLanguage/Policy:Terms of Use">Terms of Use</a> and <a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy" class="extiw" title="foundation:Special:MyLanguage/Policy:Privacy policy">Privacy Policy</a>. Wikipedia® is a registered trademark of the <a rel="nofollow" class="external text" href="https://wikimediafoundation.org/">Wikimedia Foundation, Inc.</a>, a non-profit organization.</li> </ul> <ul id="footer-places"> <li id="footer-places-privacy"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy">Privacy policy</a></li> <li id="footer-places-about"><a href="/wiki/Wikipedia:About">About Wikipedia</a></li> <li id="footer-places-disclaimers"><a href="/wiki/Wikipedia:General_disclaimer">Disclaimers</a></li> <li id="footer-places-contact"><a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us">Contact Wikipedia</a></li> <li id="footer-places-wm-codeofconduct"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Universal_Code_of_Conduct">Code of Conduct</a></li> <li id="footer-places-developers"><a href="https://developer.wikimedia.org">Developers</a></li> <li id="footer-places-statslink"><a href="https://stats.wikimedia.org/#/en.wikipedia.org">Statistics</a></li> <li id="footer-places-cookiestatement"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Cookie_statement">Cookie statement</a></li> <li id="footer-places-mobileview"><a href="//en.m.wikipedia.org/w/index.php?title=Large_language_model&mobileaction=toggle_view_mobile" class="noprint stopMobileRedirectToggle">Mobile view</a></li> </ul> <ul id="footer-icons" class="noprint"> <li id="footer-copyrightico"><a 
href="https://wikimediafoundation.org/" class="cdx-button cdx-button--fake-button cdx-button--size-large cdx-button--fake-button--enabled"><img src="/static/images/footer/wikimedia-button.svg" width="84" height="29" alt="Wikimedia Foundation" loading="lazy"></a></li> <li id="footer-poweredbyico"><a href="https://www.mediawiki.org/" class="cdx-button cdx-button--fake-button cdx-button--size-large cdx-button--fake-button--enabled"><img src="/w/resources/assets/poweredby_mediawiki.svg" alt="Powered by MediaWiki" width="88" height="31" loading="lazy"></a></li> </ul> </footer> </div> </div> </div> <div class="vector-settings" id="p-dock-bottom"> <ul></ul> </div><script>(RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgHostname":"mw-web.codfw.main-f69cdc8f6-88l8n","wgBackendResponseTime":154,"wgPageParseReport":{"limitreport":{"cputime":"2.979","walltime":"3.250","ppvisitednodes":{"value":19504,"limit":1000000},"postexpandincludesize":{"value":684454,"limit":2097152},"templateargumentsize":{"value":13887,"limit":2097152},"expansiondepth":{"value":23,"limit":100},"expensivefunctioncount":{"value":23,"limit":500},"unstrip-depth":{"value":1,"limit":20},"unstrip-size":{"value":913188,"limit":5000000},"entityaccesscount":{"value":0,"limit":400},"timingprofile":["100.00% 2788.884 1 -total"," 73.70% 2055.287 2 Template:Reflist"," 24.07% 671.244 76 Template:Cite_arXiv"," 19.57% 545.646 100 Template:Cite_web"," 8.63% 240.776 28 Template:Cite_journal"," 7.25% 202.072 1 Template:Harvnb"," 5.06% 141.079 1 Template:Machine_learning"," 4.83% 134.763 1 Template:Sidebar_with_collapsible_lists"," 3.02% 84.158 1 Template:Short_description"," 2.59% 72.245 13 Template:Citation"]},"scribunto":{"limitreport-timeusage":{"value":"2.008","limit":"10.000"},"limitreport-memusage":{"value":9463208,"limit":52428800},"limitreport-logs":"anchor_id_list = table#1 {\n [\"CITEREFAI2024\"] = 1,\n [\"CITEREFAbdinJacobsAwanAneja2024\"] = 1,\n [\"CITEREFAlayracDonahueLucMiech2022\"] = 1,\n 
[\"CITEREFAlba2023\"] = 1,\n [\"CITEREFAlbrecht2024\"] = 1,\n [\"CITEREFAllamar\"] = 2,\n [\"CITEREFAlviKharya2021\"] = 1,\n [\"CITEREFAnanthaswamy2023\"] = 1,\n [\"CITEREFAntolAgrawalLuMitchell2015\"] = 1,\n [\"CITEREFAskellBaiChenDrain2021\"] = 1,\n [\"CITEREFBahdanauChoBengio2014\"] = 1,\n [\"CITEREFBaiKadavathKunduAskell2022\"] = 1,\n [\"CITEREFBankoBrill2001\"] = 1,\n [\"CITEREFBidermanSchoelkopfAnthonyBradley2023\"] = 1,\n [\"CITEREFBlackBidermanHallahan2022\"] = 1,\n [\"CITEREFBowman2023\"] = 1,\n [\"CITEREFBrownMannRyderSubbiah2020\"] = 3,\n [\"CITEREFBubeckChandrasekaranEldanGehrke2023\"] = 1,\n [\"CITEREFCaballeroGuptaRishKrueger2022\"] = 1,\n [\"CITEREFChenLiBaiYang2021\"] = 1,\n [\"CITEREFChengDurmusJurafsky2023\"] = 1,\n [\"CITEREFChengThoppilan2022\"] = 1,\n [\"CITEREFClarkLeeChangKwiatkowski2019\"] = 1,\n [\"CITEREFDaiDu2021\"] = 1,\n [\"CITEREFDettmersPagnoniHoltzmanZettlemoyer2023\"] = 1,\n [\"CITEREFDettmersSvirschevskiEgiazarianKuznedelev2023\"] = 1,\n [\"CITEREFDevlinChangLeeToutanova2018\"] = 2,\n [\"CITEREFDey2023\"] = 1,\n [\"CITEREFDeyGosalZhimingChen2023\"] = 1,\n [\"CITEREFDickson2024\"] = 1,\n [\"CITEREFDodgeSapMarasovićAgnew2021\"] = 1,\n [\"CITEREFDriessXiaSajjadiLynch2023\"] = 1,\n [\"CITEREFElias2023\"] = 1,\n [\"CITEREFEvans2014\"] = 1,\n [\"CITEREFFathallahDasDe_GiorgisPoltronieri2024\"] = 1,\n [\"CITEREFFrank2023\"] = 1,\n [\"CITEREFFrantarAshkboosHoeflerAlistarh2022\"] = 1,\n [\"CITEREFFranzen2023\"] = 1,\n [\"CITEREFFriston2022\"] = 1,\n [\"CITEREFGaoBidermanBlackGolding2020\"] = 1,\n [\"CITEREFGaoMadaanZhouAlon2022\"] = 1,\n [\"CITEREFGoodman2001\"] = 1,\n [\"CITEREFGrootendorst\"] = 1,\n [\"CITEREFGuDao2023\"] = 1,\n [\"CITEREFHahnGoyal2023\"] = 1,\n [\"CITEREFHalevyNorvigPereira2009\"] = 1,\n [\"CITEREFHaoGuMaJiahua_Hong2023\"] = 1,\n [\"CITEREFHeaven2023\"] = 1,\n [\"CITEREFHeikkilä2023\"] = 1,\n [\"CITEREFHern2019\"] = 1,\n [\"CITEREFHoffmannBorgeaudMenschBuchatskaya2022\"] = 2,\n [\"CITEREFHoffmannBorgeaudMenschSifre2022\"] 
= 1,\n [\"CITEREFHubinger2024\"] = 1,\n [\"CITEREFHughes2023\"] = 1,\n [\"CITEREFHuyen2019\"] = 1,\n [\"CITEREFIyer2021\"] = 1,\n [\"CITEREFJiLeeFrieskeYu2022\"] = 1,\n [\"CITEREFJinRinard2023\"] = 1,\n [\"CITEREFJurafskyMartin2023\"] = 1,\n [\"CITEREFKaddour2023\"] = 1,\n [\"CITEREFKang2023\"] = 1,\n [\"CITEREFKaplanMcCandlishHenighanBrown2020\"] = 1,\n [\"CITEREFKaushalMahowald2022\"] = 1,\n [\"CITEREFKhrushchevVasilevPetrovZinov2022\"] = 1,\n [\"CITEREFKilgarriffGrefenstette2003\"] = 1,\n [\"CITEREFKirosSalakhutdinovZemel2014\"] = 1,\n [\"CITEREFKotekDockumSun2023\"] = 1,\n [\"CITEREFKrizhevskySutskeverHinton2012\"] = 1,\n [\"CITEREFKöpfKilchervon_RütteAnagnostidis2023\"] = 1,\n [\"CITEREFLakoff1999\"] = 1,\n [\"CITEREFLeeIppolitoNystromZhang2022\"] = 1,\n [\"CITEREFLepikhinLeeXuChen2021\"] = 1,\n [\"CITEREFLewisPerezPiktusPetroni2020\"] = 1,\n [\"CITEREFLewkowyczAndreassenDohanDyer2022\"] = 1,\n [\"CITEREFLiBubeckEldanDel_Giorno2023\"] = 1,\n [\"CITEREFLiHopkinsBauViégas2022\"] = 1,\n [\"CITEREFLiLiSavareseHoi2023\"] = 1,\n [\"CITEREFLiangWuSongWu2023\"] = 1,\n [\"CITEREFLinGouGongLiu2024\"] = 1,\n [\"CITEREFLinHiltonEvans2021\"] = 1,\n [\"CITEREFLiuLiWuLee2023\"] = 1,\n [\"CITEREFLundberg2023\"] = 1,\n [\"CITEREFLuoPuettSmith2023\"] = 1,\n [\"CITEREFMann\"] = 1,\n [\"CITEREFManning2022\"] = 1,\n [\"CITEREFMaslejFattoriniBrynjolfssonEtchemendy2023\"] = 1,\n [\"CITEREFMerritt2022\"] = 1,\n [\"CITEREFMetz2023\"] = 1,\n [\"CITEREFMitchellKrakauer2023\"] = 1,\n [\"CITEREFMukherjeeChang2024\"] = 1,\n [\"CITEREFNagelAmjadBaalenLouizos2020\"] = 1,\n [\"CITEREFNandaChanLieberumSmith2023\"] = 1,\n [\"CITEREFNarangChowdhery2022\"] = 1,\n [\"CITEREFNewport2023\"] = 1,\n [\"CITEREFNirmal2023\"] = 1,\n [\"CITEREFOpenAI2023\"] = 2,\n [\"CITEREFOrnes2023\"] = 1,\n [\"CITEREFOuyangWuJiangAlmeida2022\"] = 1,\n [\"CITEREFPaaßGiesselbach2022\"] = 1,\n [\"CITEREFParanjapeLundbergSinghHajishirzi2023\"] = 1,\n [\"CITEREFParkO\u0026#039;BrienCaiRingel_Morris2023\"] = 1,\n 
[\"CITEREFPatelLiRasooliConstant2022\"] = 1,\n [\"CITEREFPatelPavlick2021\"] = 1,\n [\"CITEREFPatilZhangWangGonzalez2023\"] = 1,\n [\"CITEREFPenedoMalarticHesslowCojocaru2023\"] = 1,\n [\"CITEREFPengAlcaideAnthonyAlbalak2023\"] = 1,\n [\"CITEREFPengWangDeng2023\"] = 1,\n [\"CITEREFPetrovEmanuele_La_MalfaTorrBibi2023\"] = 1,\n [\"CITEREFPetrovMalfaTorrBibi2023\"] = 1,\n [\"CITEREFPichai2023\"] = 1,\n [\"CITEREFPilehvarCamacho-Collados2019\"] = 1,\n [\"CITEREFPolinoPascanuAlistarh2018\"] = 1,\n [\"CITEREFPrickett2021\"] = 1,\n [\"CITEREFRaffelShazeerRobertsLee2020\"] = 1,\n [\"CITEREFRajbhandariLiYaoZhang2022\"] = 1,\n [\"CITEREFRenZhouMengHuang2023\"] = 1,\n [\"CITEREFResnikSmith2003\"] = 1,\n [\"CITEREFRogersKovalevaRumshisky2020\"] = 1,\n [\"CITEREFRoose2023\"] = 1,\n [\"CITEREFSchaefferMirandaKoyejo2023\"] = 1,\n [\"CITEREFSchreiner2023\"] = 1,\n [\"CITEREFSharirPelegShoham2020\"] = 1,\n [\"CITEREFShazeerMirhoseiniMaziarzDavis2017\"] = 1,\n [\"CITEREFShinnCassanoLabashGopinath2023\"] = 1,\n [\"CITEREFSmithPatwaryNorickLeGresley2022\"] = 1,\n [\"CITEREFSoltanAnanthakrishnanFitzGeraldGupta2022\"] = 1,\n [\"CITEREFSrivastavaRastogiRaoAbu_Awal_Md_Shoeb2022\"] = 1,\n [\"CITEREFStephen_Council2023\"] = 1,\n [\"CITEREFStokel-Walker2023\"] = 1,\n [\"CITEREFSusan_ZhangMona_DiabLuke_Zettlemoyer\"] = 1,\n [\"CITEREFTaylorKardasCucurullScialom2022\"] = 1,\n [\"CITEREFThoppilanDe_FreitasHallShazeer2022\"] = 1,\n [\"CITEREFVarshneyYaoZhangChen2023\"] = 1,\n [\"CITEREFVaswaniShazeerParmarUszkoreit2017\"] = 1,\n [\"CITEREFWang2024\"] = 1,\n [\"CITEREFWangCaiLiuMa2023\"] = 1,\n [\"CITEREFWangKordiMishraLiu2022\"] = 1,\n [\"CITEREFWangSunXiangWu2021\"] = 1,\n [\"CITEREFWayne_Xin_ZhaoZhouLiTang2023\"] = 1,\n [\"CITEREFWeiTayBommasaniRaffel2022\"] = 1,\n [\"CITEREFWiggers2022\"] = 1,\n [\"CITEREFWiggers2023\"] = 1,\n [\"CITEREFWiggers2024\"] = 1,\n [\"CITEREFWrobel\"] = 1,\n [\"CITEREFWuIrsoyLuDabravolski2023\"] = 1,\n [\"CITEREFWuPrabhumoyeMin2023\"] = 1,\n 
[\"CITEREFYangDaiYangCarbonell2020\"] = 1,\n [\"CITEREFYaoZhaoYuDu2022\"] = 1,\n [\"CITEREFYennie_Jun2023\"] = 1,\n [\"CITEREFYinFuZhaoLi2023\"] = 1,\n [\"CITEREFZaibShengEmma_Zhang2020\"] = 1,\n [\"CITEREFZellersHoltzmanBiskFarhadi2019\"] = 1,\n [\"CITEREFZhangLehmanStanleyClune2023\"] = 1,\n [\"CITEREFZhangLiBing2023\"] = 1,\n [\"CITEREFZhangRollerGoyalArtetxe2022\"] = 1,\n [\"CITEREFZhaoZhouLi2023\"] = 1,\n [\"Emergent_abilities\"] = 1,\n [\"Tokenization\"] = 1,\n}\ntemplate_list = table#1 {\n [\"!\"] = 4,\n [\"Anchor\"] = 2,\n [\"Artificial intelligence (AI)\"] = 1,\n [\"As of\"] = 3,\n [\"Br\"] = 4,\n [\"Citation\"] = 13,\n [\"Citation needed\"] = 1,\n [\"Cite arXiv\"] = 76,\n [\"Cite arxiv\"] = 1,\n [\"Cite book\"] = 7,\n [\"Cite conference\"] = 2,\n [\"Cite journal\"] = 28,\n [\"Cite magazine\"] = 1,\n [\"Cite news\"] = 8,\n [\"Cite web\"] = 100,\n [\"Distinguish\"] = 1,\n [\"Dts\"] = 50,\n [\"Dubious\"] = 2,\n [\"Efn\"] = 6,\n [\"Further\"] = 1,\n [\"Harvnb\"] = 1,\n [\"Machine learning\"] = 1,\n [\"Main\"] = 6,\n [\"Main article\"] = 1,\n [\"Natural language processing\"] = 1,\n [\"No\"] = 23,\n [\"Notelist\"] = 1,\n [\"Partial success\"] = 8,\n [\"Reflist\"] = 1,\n [\"See also\"] = 6,\n [\"Short description\"] = 1,\n [\"Smalldiv\"] = 1,\n [\"Sort\"] = 60,\n [\"Webarchive\"] = 3,\n [\"Yes\"] = 21,\n}\narticle_whitelist = table#1 {\n}\n","limitreport-profile":[["?","320","15.4"],["MediaWiki\\Extension\\Scribunto\\Engines\\LuaSandbox\\LuaSandboxCallback::callParserFunction","220","10.6"],["MediaWiki\\Extension\\Scribunto\\Engines\\LuaSandbox\\LuaSandboxCallback::gsub","200","9.6"],["dataWrapper \u003Cmw.lua:672\u003E","200","9.6"],["MediaWiki\\Extension\\Scribunto\\Engines\\LuaSandbox\\LuaSandboxCallback::find","160","7.7"],["\u003Cmw.lua:694\u003E","140","6.7"],["type","100","4.8"],["MediaWiki\\Extension\\Scribunto\\Engines\\LuaSandbox\\LuaSandboxCallback::preprocess","80","3.8"],["(for 
generator)","60","2.9"],["MediaWiki\\Extension\\Scribunto\\Engines\\LuaSandbox\\LuaSandboxCallback::match","60","2.9"],["[others]","540","26.0"]]},"cachereport":{"origin":"mw-web.codfw.main-f69cdc8f6-24l65","timestamp":"20241122152908","ttl":2592000,"transientcontent":false}}});});</script> <script type="application/ld+json">{"@context":"https:\/\/schema.org","@type":"Article","name":"Large language model","url":"https:\/\/en.wikipedia.org\/wiki\/Large_language_model","sameAs":"http:\/\/www.wikidata.org\/entity\/Q115305900","mainEntity":"http:\/\/www.wikidata.org\/entity\/Q115305900","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\/\/www.wikimedia.org\/static\/images\/wmf-hor-googpub.png"}},"datePublished":"2023-03-09T15:43:17Z","dateModified":"2024-11-19T21:04:22Z","headline":"language model built with very large amounts of texts"}</script> </body> </html>