Transformer (deep learning architecture)
[o]" accesskey="o"><span class="vector-icon mw-ui-icon-logIn mw-ui-icon-wikimedia-logIn"></span> <span>Log in</span></a></li> </ul> </div> </div> <div id="p-user-menu-anon-editor" class="vector-menu mw-portlet mw-portlet-user-menu-anon-editor" > <div class="vector-menu-heading"> Pages for logged out editors <a href="/wiki/Help:Introduction" aria-label="Learn more about editing"><span>learn more</span></a> </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-anoncontribs" class="mw-list-item"><a href="/wiki/Special:MyContributions" title="A list of edits made from this IP address [y]" accesskey="y"><span>Contributions</span></a></li><li id="pt-anontalk" class="mw-list-item"><a href="/wiki/Special:MyTalk" title="Discussion about edits from this IP address [n]" accesskey="n"><span>Talk</span></a></li> </ul> </div> </div> </div> </div> </nav> </div> </header> </div> <div class="mw-page-container"> <div class="mw-page-container-inner"> <div class="vector-sitenotice-container"> <div id="siteNotice"><!-- CentralNotice --></div> </div> <div class="vector-column-start"> <div class="vector-main-menu-container"> <div id="mw-navigation"> <nav id="mw-panel" class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-pinned-container" class="vector-pinned-container"> </div> </nav> </div> </div> <div class="vector-sticky-pinned-container"> <nav id="mw-panel-toc" aria-label="Contents" data-event-name="ui.sidebar-toc" class="mw-table-of-contents-container vector-toc-landmark"> <div id="vector-toc-pinned-container" class="vector-pinned-container"> <div id="vector-toc" class="vector-toc vector-pinnable-element"> <div class="vector-pinnable-header vector-toc-pinnable-header vector-pinnable-header-pinned" data-feature-name="toc-pinned" data-pinnable-element-id="vector-toc" > <h2 class="vector-pinnable-header-label">Contents</h2> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-toc.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-toc.unpin">hide</button> </div> <ul class="vector-toc-contents" id="mw-panel-toc-list"> <li id="toc-mw-content-text" class="vector-toc-list-item vector-toc-level-1"> <a href="#" class="vector-toc-link"> <div class="vector-toc-text">(Top)</div> </a> </li> <li id="toc-History" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#History"> <div class="vector-toc-text"> <span class="vector-toc-numb">1</span> <span>History</span> </div> </a> <button aria-controls="toc-History-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle History subsection</span> </button> <ul id="toc-History-sublist" class="vector-toc-list"> <li id="toc-Predecessors" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Predecessors"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.1</span> <span>Predecessors</span> </div> </a> <ul id="toc-Predecessors-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Attention_with_seq2seq" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Attention_with_seq2seq"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.2</span> <span>Attention with seq2seq</span> </div> </a> <ul 
id="toc-Attention_with_seq2seq-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Parallelizing_attention" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Parallelizing_attention"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.3</span> <span>Parallelizing attention</span> </div> </a> <ul id="toc-Parallelizing_attention-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-AI_boom_era" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#AI_boom_era"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.4</span> <span>AI boom era</span> </div> </a> <ul id="toc-AI_boom_era-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Training" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Training"> <div class="vector-toc-text"> <span class="vector-toc-numb">2</span> <span>Training</span> </div> </a> <button aria-controls="toc-Training-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Training subsection</span> </button> <ul id="toc-Training-sublist" class="vector-toc-list"> <li id="toc-Methods_for_stabilizing_training" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Methods_for_stabilizing_training"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.1</span> <span>Methods for stabilizing training</span> </div> </a> <ul id="toc-Methods_for_stabilizing_training-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Pretrain-finetune" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Pretrain-finetune"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.2</span> <span>Pretrain-finetune</span> </div> </a> <ul id="toc-Pretrain-finetune-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Tasks" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Tasks"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.3</span> <span>Tasks</span> </div> </a> <ul id="toc-Tasks-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Architecture" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Architecture"> <div class="vector-toc-text"> <span class="vector-toc-numb">3</span> <span>Architecture</span> </div> </a> <button aria-controls="toc-Architecture-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Architecture subsection</span> </button> <ul id="toc-Architecture-sublist" class="vector-toc-list"> <li id="toc-Tokenization" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Tokenization"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.1</span> <span>Tokenization</span> </div> </a> <ul id="toc-Tokenization-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Embedding" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Embedding"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.2</span> <span>Embedding</span> </div> </a> <ul id="toc-Embedding-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Un-embedding" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Un-embedding"> <div class="vector-toc-text"> <span 
class="vector-toc-numb">3.3</span> <span>Un-embedding</span> </div> </a> <ul id="toc-Un-embedding-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Positional_encoding" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Positional_encoding"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.4</span> <span>Positional encoding</span> </div> </a> <ul id="toc-Positional_encoding-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Encoder-decoder_(overview)" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Encoder-decoder_(overview)"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.5</span> <span>Encoder-decoder (overview)</span> </div> </a> <ul id="toc-Encoder-decoder_(overview)-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Feedforward_network" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Feedforward_network"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.6</span> <span>Feedforward network</span> </div> </a> <ul id="toc-Feedforward_network-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Scaled_dot-product_attention" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Scaled_dot-product_attention"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.7</span> <span>Scaled dot-product attention</span> </div> </a> <ul id="toc-Scaled_dot-product_attention-sublist" class="vector-toc-list"> <li id="toc-Attention_head" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Attention_head"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.7.1</span> <span>Attention head</span> </div> </a> <ul id="toc-Attention_head-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Multiheaded_attention" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Multiheaded_attention"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.7.2</span> <span>Multiheaded attention</span> </div> </a> <ul id="toc-Multiheaded_attention-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Masked_attention" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Masked_attention"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.7.3</span> <span>Masked attention</span> </div> </a> <ul id="toc-Masked_attention-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Encoder" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Encoder"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.8</span> <span>Encoder</span> </div> </a> <ul id="toc-Encoder-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Decoder" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Decoder"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.9</span> <span>Decoder</span> </div> </a> <ul id="toc-Decoder-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Adapted_architectures" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Adapted_architectures"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.10</span> <span>Adapted architectures</span> </div> </a> <ul id="toc-Adapted_architectures-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Full_transformer_architecture" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" 
href="#Full_transformer_architecture"> <div class="vector-toc-text"> <span class="vector-toc-numb">4</span> <span>Full transformer architecture</span> </div> </a> <button aria-controls="toc-Full_transformer_architecture-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Full transformer architecture subsection</span> </button> <ul id="toc-Full_transformer_architecture-sublist" class="vector-toc-list"> <li id="toc-Sublayers" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Sublayers"> <div class="vector-toc-text"> <span class="vector-toc-numb">4.1</span> <span>Sublayers</span> </div> </a> <ul id="toc-Sublayers-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Pseudocode" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Pseudocode"> <div class="vector-toc-text"> <span class="vector-toc-numb">4.2</span> <span>Pseudocode</span> </div> </a> <ul id="toc-Pseudocode-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Terminology" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Terminology"> <div class="vector-toc-text"> <span class="vector-toc-numb">4.3</span> <span>Terminology</span> </div> </a> <ul id="toc-Terminology-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Subsequent_work" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Subsequent_work"> <div class="vector-toc-text"> <span class="vector-toc-numb">5</span> <span>Subsequent work</span> </div> </a> <button aria-controls="toc-Subsequent_work-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Subsequent work subsection</span> </button> <ul id="toc-Subsequent_work-sublist" class="vector-toc-list"> <li id="toc-Alternative_activation_functions" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Alternative_activation_functions"> <div class="vector-toc-text"> <span class="vector-toc-numb">5.1</span> <span>Alternative activation functions</span> </div> </a> <ul id="toc-Alternative_activation_functions-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Alternative_normalizations" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Alternative_normalizations"> <div class="vector-toc-text"> <span class="vector-toc-numb">5.2</span> <span>Alternative normalizations</span> </div> </a> <ul id="toc-Alternative_normalizations-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Alternative_positional_encodings" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Alternative_positional_encodings"> <div class="vector-toc-text"> <span class="vector-toc-numb">5.3</span> <span>Alternative positional encodings</span> </div> </a> <ul id="toc-Alternative_positional_encodings-sublist" class="vector-toc-list"> <li id="toc-RoPE" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#RoPE"> <div class="vector-toc-text"> <span class="vector-toc-numb">5.3.1</span> <span>RoPE</span> </div> </a> <ul id="toc-RoPE-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-ALiBi" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#ALiBi"> <div class="vector-toc-text"> <span class="vector-toc-numb">5.3.2</span> 
<span>ALiBi</span> </div> </a> <ul id="toc-ALiBi-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Relative_Position_Encodings" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Relative_Position_Encodings"> <div class="vector-toc-text"> <span class="vector-toc-numb">5.3.3</span> <span>Relative Position Encodings</span> </div> </a> <ul id="toc-Relative_Position_Encodings-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Efficient_implementation" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Efficient_implementation"> <div class="vector-toc-text"> <span class="vector-toc-numb">5.4</span> <span>Efficient implementation</span> </div> </a> <ul id="toc-Efficient_implementation-sublist" class="vector-toc-list"> <li id="toc-KV_caching" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#KV_caching"> <div class="vector-toc-text"> <span class="vector-toc-numb">5.4.1</span> <span>KV caching</span> </div> </a> <ul id="toc-KV_caching-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-FlashAttention" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#FlashAttention"> <div class="vector-toc-text"> <span class="vector-toc-numb">5.4.2</span> <span>FlashAttention</span> </div> </a> <ul id="toc-FlashAttention-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Multi-Query_Attention" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Multi-Query_Attention"> <div class="vector-toc-text"> <span class="vector-toc-numb">5.4.3</span> <span>Multi-Query Attention</span> </div> </a> <ul id="toc-Multi-Query_Attention-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Speculative_decoding" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Speculative_decoding"> <div class="vector-toc-text"> <span class="vector-toc-numb">5.4.4</span> <span>Speculative decoding</span> </div> </a> <ul id="toc-Speculative_decoding-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Sub-quadratic_transformers" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Sub-quadratic_transformers"> <div class="vector-toc-text"> <span class="vector-toc-numb">5.5</span> <span>Sub-quadratic transformers</span> </div> </a> <ul id="toc-Sub-quadratic_transformers-sublist" class="vector-toc-list"> <li id="toc-Alternative_attention_graphs" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Alternative_attention_graphs"> <div class="vector-toc-text"> <span class="vector-toc-numb">5.5.1</span> <span>Alternative attention graphs</span> </div> </a> <ul id="toc-Alternative_attention_graphs-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Random_Feature_Attention" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Random_Feature_Attention"> <div class="vector-toc-text"> <span class="vector-toc-numb">5.5.2</span> <span>Random Feature Attention</span> </div> </a> <ul id="toc-Random_Feature_Attention-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Multimodality" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Multimodality"> <div class="vector-toc-text"> <span class="vector-toc-numb">5.6</span> <span>Multimodality</span> </div> </a> <ul id="toc-Multimodality-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Applications" 
class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Applications"> <div class="vector-toc-text"> <span class="vector-toc-numb">6</span> <span>Applications</span> </div> </a> <ul id="toc-Applications-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-See_also" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#See_also"> <div class="vector-toc-text"> <span class="vector-toc-numb">7</span> <span>See also</span> </div> </a> <ul id="toc-See_also-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Notes" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Notes"> <div class="vector-toc-text"> <span class="vector-toc-numb">8</span> <span>Notes</span> </div> </a> <ul id="toc-Notes-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-References" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#References"> <div class="vector-toc-text"> <span class="vector-toc-numb">9</span> <span>References</span> </div> </a> <ul id="toc-References-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Further_reading" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Further_reading"> <div class="vector-toc-text"> <span class="vector-toc-numb">10</span> <span>Further reading</span> </div> </a> <ul id="toc-Further_reading-sublist" class="vector-toc-list"> </ul> </li> </ul> </div> </div> </nav> </div> </div> <div class="mw-content-container"> <main id="content" class="mw-body"> <header class="mw-body-header vector-page-titlebar"> <nav aria-label="Contents" class="vector-toc-landmark"> <div id="vector-page-titlebar-toc" class="vector-dropdown vector-page-titlebar-toc vector-button-flush-left" title="Table of Contents" > <input type="checkbox" id="vector-page-titlebar-toc-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-titlebar-toc" class="vector-dropdown-checkbox " aria-label="Toggle the table of contents" > <label id="vector-page-titlebar-toc-label" for="vector-page-titlebar-toc-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-listBullet mw-ui-icon-wikimedia-listBullet"></span> <span class="vector-dropdown-label-text">Toggle the table of contents</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-titlebar-toc-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <h1 id="firstHeading" class="firstHeading mw-first-heading"><span class="mw-page-title-main">Transformer (deep learning architecture)</span></h1> <div id="p-lang-btn" class="vector-dropdown mw-portlet mw-portlet-lang" > <input type="checkbox" id="p-lang-btn-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-p-lang-btn" class="vector-dropdown-checkbox mw-interlanguage-selector" aria-label="Go to an article in another language. 
<div class="vector-sticky-pinned-container"> <nav class="vector-page-tools-landmark" aria-label="Page tools"> <div id="vector-page-tools-pinned-container" class="vector-pinned-container"> </div> </nav> <nav class="vector-appearance-landmark" aria-label="Appearance"> <div id="vector-appearance-pinned-container" class="vector-pinned-container"> <div id="vector-appearance" class="vector-appearance vector-pinnable-element"> <div class="vector-pinnable-header vector-appearance-pinnable-header vector-pinnable-header-pinned" data-feature-name="appearance-pinned" data-pinnable-element-id="vector-appearance" data-pinned-container-id="vector-appearance-pinned-container" data-unpinned-container-id="vector-appearance-unpinned-container" > <div class="vector-pinnable-header-label">Appearance</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-appearance.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-appearance.unpin">hide</button> </div> </div> </div> </nav> </div> </div> <div id="bodyContent" class="vector-body" aria-labelledby="firstHeading" data-mw-ve-target-container> <div class="vector-body-before-content"> <div class="mw-indicators"> </div> <div id="siteSub" class="noprint">From Wikipedia, the free encyclopedia</div> </div> <div id="contentSub"><div id="mw-content-subtitle"></div></div> <div id="mw-content-text" class="mw-body-content"><div class="mw-content-ltr mw-parser-output" lang="en" dir="ltr"><div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">Deep learning architecture for modelling sequential data</div> <style data-mw-deduplicate="TemplateStyles:r1244144826">.mw-parser-output .machine-learning-list-title{background-color:#ddddff}html.skin-theme-clientpref-night .mw-parser-output .machine-learning-list-title{background-color:#222}@media(prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .machine-learning-list-title{background-color:#222}}</style> <style data-mw-deduplicate="TemplateStyles:r1129693374">.mw-parser-output .hlist dl,.mw-parser-output .hlist ol,.mw-parser-output .hlist ul{margin:0;padding:0}.mw-parser-output .hlist dd,.mw-parser-output .hlist dt,.mw-parser-output .hlist li{margin:0;display:inline}.mw-parser-output .hlist.inline,.mw-parser-output .hlist.inline dl,.mw-parser-output .hlist.inline ol,.mw-parser-output .hlist.inline ul,.mw-parser-output .hlist dl dl,.mw-parser-output .hlist dl ol,.mw-parser-output .hlist dl ul,.mw-parser-output .hlist ol dl,.mw-parser-output .hlist ol ol,.mw-parser-output .hlist ol ul,.mw-parser-output .hlist ul dl,.mw-parser-output .hlist ul ol,.mw-parser-output .hlist ul ul{display:inline}.mw-parser-output .hlist .mw-empty-li{display:none}.mw-parser-output .hlist dt::after{content:": "}.mw-parser-output .hlist dd::after,.mw-parser-output .hlist li::after{content:" · ";font-weight:bold}.mw-parser-output .hlist dd:last-child::after,.mw-parser-output .hlist dt:last-child::after,.mw-parser-output .hlist li:last-child::after{content:none}.mw-parser-output .hlist dd dd:first-child::before,.mw-parser-output .hlist dd dt:first-child::before,.mw-parser-output .hlist dd li:first-child::before,.mw-parser-output .hlist dt dd:first-child::before,.mw-parser-output .hlist dt dt:first-child::before,.mw-parser-output .hlist dt li:first-child::before,.mw-parser-output .hlist li 
dd:first-child::before,.mw-parser-output .hlist li dt:first-child::before,.mw-parser-output .hlist li li:first-child::before{content:" (";font-weight:normal}.mw-parser-output .hlist dd dd:last-child::after,.mw-parser-output .hlist dd dt:last-child::after,.mw-parser-output .hlist dd li:last-child::after,.mw-parser-output .hlist dt dd:last-child::after,.mw-parser-output .hlist dt dt:last-child::after,.mw-parser-output .hlist dt li:last-child::after,.mw-parser-output .hlist li dd:last-child::after,.mw-parser-output .hlist li dt:last-child::after,.mw-parser-output .hlist li li:last-child::after{content:")";font-weight:normal}.mw-parser-output .hlist ol{counter-reset:listitem}.mw-parser-output .hlist ol>li{counter-increment:listitem}.mw-parser-output .hlist ol>li::before{content:" "counter(listitem)"\a0 "}.mw-parser-output .hlist dd ol>li:first-child::before,.mw-parser-output .hlist dt ol>li:first-child::before,.mw-parser-output .hlist li ol>li:first-child::before{content:" ("counter(listitem)"\a0 "}</style><style data-mw-deduplicate="TemplateStyles:r1246091330">.mw-parser-output .sidebar{width:22em;float:right;clear:right;margin:0.5em 0 1em 1em;background:var(--background-color-neutral-subtle,#f8f9fa);border:1px solid var(--border-color-base,#a2a9b1);padding:0.2em;text-align:center;line-height:1.4em;font-size:88%;border-collapse:collapse;display:table}body.skin-minerva .mw-parser-output .sidebar{display:table!important;float:right!important;margin:0.5em 0 1em 1em!important}.mw-parser-output .sidebar-subgroup{width:100%;margin:0;border-spacing:0}.mw-parser-output .sidebar-left{float:left;clear:left;margin:0.5em 1em 1em 0}.mw-parser-output .sidebar-none{float:none;clear:both;margin:0.5em 1em 1em 0}.mw-parser-output .sidebar-outer-title{padding:0 0.4em 0.2em;font-size:125%;line-height:1.2em;font-weight:bold}.mw-parser-output .sidebar-top-image{padding:0.4em}.mw-parser-output .sidebar-top-caption,.mw-parser-output .sidebar-pretitle-with-top-image,.mw-parser-output .sidebar-caption{padding:0.2em 0.4em 0;line-height:1.2em}.mw-parser-output .sidebar-pretitle{padding:0.4em 0.4em 0;line-height:1.2em}.mw-parser-output .sidebar-title,.mw-parser-output .sidebar-title-with-pretitle{padding:0.2em 0.8em;font-size:145%;line-height:1.2em}.mw-parser-output .sidebar-title-with-pretitle{padding:0.1em 0.4em}.mw-parser-output .sidebar-image{padding:0.2em 0.4em 0.4em}.mw-parser-output .sidebar-heading{padding:0.1em 0.4em}.mw-parser-output .sidebar-content{padding:0 0.5em 0.4em}.mw-parser-output .sidebar-content-with-subgroup{padding:0.1em 0.4em 0.2em}.mw-parser-output .sidebar-above,.mw-parser-output .sidebar-below{padding:0.3em 0.8em;font-weight:bold}.mw-parser-output .sidebar-collapse .sidebar-above,.mw-parser-output .sidebar-collapse .sidebar-below{border-top:1px solid #aaa;border-bottom:1px solid #aaa}.mw-parser-output .sidebar-navbar{text-align:right;font-size:115%;padding:0 0.4em 0.4em}.mw-parser-output .sidebar-list-title{padding:0 0.4em;text-align:left;font-weight:bold;line-height:1.6em;font-size:105%}.mw-parser-output .sidebar-list-title-c{padding:0 0.4em;text-align:center;margin:0 3.3em}@media(max-width:640px){body.mediawiki .mw-parser-output .sidebar{width:100%!important;clear:both;float:none!important;margin-left:0!important;margin-right:0!important}}body.skin--responsive .mw-parser-output .sidebar a>img{max-width:none!important}@media screen{html.skin-theme-clientpref-night .mw-parser-output .sidebar:not(.notheme) .sidebar-list-title,html.skin-theme-clientpref-night .mw-parser-output 
.sidebar:not(.notheme) .sidebar-title-with-pretitle{background:transparent!important}html.skin-theme-clientpref-night .mw-parser-output .sidebar:not(.notheme) .sidebar-title-with-pretitle a{color:var(--color-progressive)!important}}@media screen and (prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .sidebar:not(.notheme) .sidebar-list-title,html.skin-theme-clientpref-os .mw-parser-output .sidebar:not(.notheme) .sidebar-title-with-pretitle{background:transparent!important}html.skin-theme-clientpref-os .mw-parser-output .sidebar:not(.notheme) .sidebar-title-with-pretitle a{color:var(--color-progressive)!important}}@media print{body.ns-0 .mw-parser-output .sidebar{display:none!important}}</style><style data-mw-deduplicate="TemplateStyles:r886047488">.mw-parser-output .nobold{font-weight:normal}</style><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r886047488"><table class="sidebar sidebar-collapse nomobile nowraplinks"><tbody><tr><td class="sidebar-pretitle">Part of a series on</td></tr><tr><th class="sidebar-title-with-pretitle"><a href="/wiki/Machine_learning" title="Machine learning">Machine learning</a><br />and <a href="/wiki/Data_mining" title="Data mining">data mining</a></th></tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Paradigms</div><div class="sidebar-list-content mw-collapsible-content hlist" style="background-color: #FFFFFF;"> <ul><li><a href="/wiki/Supervised_learning" title="Supervised learning">Supervised learning</a></li> <li><a href="/wiki/Unsupervised_learning" title="Unsupervised learning">Unsupervised learning</a></li> <li><a href="/wiki/Semi-supervised_learning" class="mw-redirect" title="Semi-supervised learning">Semi-supervised learning</a></li> <li><a href="/wiki/Self-supervised_learning" title="Self-supervised learning">Self-supervised learning</a></li> <li><a href="/wiki/Reinforcement_learning" title="Reinforcement learning">Reinforcement learning</a></li> <li><a href="/wiki/Meta-learning_(computer_science)" title="Meta-learning (computer science)">Meta-learning</a></li> <li><a href="/wiki/Online_machine_learning" title="Online machine learning">Online learning</a></li> <li><a href="/wiki/Batch_learning" class="mw-redirect" title="Batch learning">Batch learning</a></li> <li><a href="/wiki/Curriculum_learning" title="Curriculum learning">Curriculum learning</a></li> <li><a href="/wiki/Rule-based_machine_learning" title="Rule-based machine learning">Rule-based learning</a></li> <li><a href="/wiki/Neuro-symbolic_AI" title="Neuro-symbolic AI">Neuro-symbolic AI</a></li> <li><a href="/wiki/Neuromorphic_engineering" class="mw-redirect" title="Neuromorphic engineering">Neuromorphic engineering</a></li> <li><a href="/wiki/Quantum_machine_learning" title="Quantum machine learning">Quantum machine learning</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Problems</div><div class="sidebar-list-content mw-collapsible-content hlist" style="background-color: #FFFFFF;"> <ul><li><a href="/wiki/Statistical_classification" title="Statistical classification">Classification</a></li> <li><a href="/wiki/Generative_model" title="Generative 
model">Generative modeling</a></li> <li><a href="/wiki/Regression_analysis" title="Regression analysis">Regression</a></li> <li><a href="/wiki/Cluster_analysis" title="Cluster analysis">Clustering</a></li> <li><a href="/wiki/Dimensionality_reduction" title="Dimensionality reduction">Dimensionality reduction</a></li> <li><a href="/wiki/Density_estimation" title="Density estimation">Density estimation</a></li> <li><a href="/wiki/Anomaly_detection" title="Anomaly detection">Anomaly detection</a></li> <li><a href="/wiki/Data_cleaning" class="mw-redirect" title="Data cleaning">Data cleaning</a></li> <li><a href="/wiki/Automated_machine_learning" title="Automated machine learning">AutoML</a></li> <li><a href="/wiki/Association_rule_learning" title="Association rule learning">Association rules</a></li> <li><a href="/wiki/Semantic_analysis_(machine_learning)" title="Semantic analysis (machine learning)">Semantic analysis</a></li> <li><a href="/wiki/Structured_prediction" title="Structured prediction">Structured prediction</a></li> <li><a href="/wiki/Feature_engineering" title="Feature engineering">Feature engineering</a></li> <li><a href="/wiki/Feature_learning" title="Feature learning">Feature learning</a></li> <li><a href="/wiki/Learning_to_rank" title="Learning to rank">Learning to rank</a></li> <li><a href="/wiki/Grammar_induction" title="Grammar induction">Grammar induction</a></li> <li><a href="/wiki/Ontology_learning" title="Ontology learning">Ontology learning</a></li> <li><a href="/wiki/Multimodal_learning" title="Multimodal learning">Multimodal learning</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><div style="display: inline-block; line-height: 1.2em; padding: .1em 0;"><a href="/wiki/Supervised_learning" title="Supervised learning">Supervised learning</a><br /><span class="nobold"><span style="font-size:85%;">(<b><a href="/wiki/Statistical_classification" title="Statistical classification">classification</a></b> • <b><a href="/wiki/Regression_analysis" title="Regression analysis">regression</a></b>)</span></span> </div></div><div class="sidebar-list-content mw-collapsible-content hlist" style="background-color: #FFFFFF;"> <ul><li><a href="/wiki/Apprenticeship_learning" title="Apprenticeship learning">Apprenticeship learning</a></li> <li><a href="/wiki/Decision_tree_learning" title="Decision tree learning">Decision trees</a></li> <li><a href="/wiki/Ensemble_learning" title="Ensemble learning">Ensembles</a> <ul><li><a href="/wiki/Bootstrap_aggregating" title="Bootstrap aggregating">Bagging</a></li> <li><a href="/wiki/Boosting_(machine_learning)" title="Boosting (machine learning)">Boosting</a></li> <li><a href="/wiki/Random_forest" title="Random forest">Random forest</a></li></ul></li> <li><a href="/wiki/K-nearest_neighbors_algorithm" title="K-nearest neighbors algorithm"><i>k</i>-NN</a></li> <li><a href="/wiki/Linear_regression" title="Linear regression">Linear regression</a></li> <li><a href="/wiki/Naive_Bayes_classifier" title="Naive Bayes classifier">Naive Bayes</a></li> <li><a href="/wiki/Artificial_neural_network" class="mw-redirect" title="Artificial neural network">Artificial neural networks</a></li> <li><a href="/wiki/Logistic_regression" title="Logistic regression">Logistic regression</a></li> <li><a href="/wiki/Perceptron" title="Perceptron">Perceptron</a></li> 
<li><a href="/wiki/Relevance_vector_machine" title="Relevance vector machine">Relevance vector machine (RVM)</a></li> <li><a href="/wiki/Support_vector_machine" title="Support vector machine">Support vector machine (SVM)</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><a href="/wiki/Cluster_analysis" title="Cluster analysis">Clustering</a></div><div class="sidebar-list-content mw-collapsible-content hlist" style="background-color: #FFFFFF;"> <ul><li><a href="/wiki/BIRCH" title="BIRCH">BIRCH</a></li> <li><a href="/wiki/CURE_algorithm" title="CURE algorithm">CURE</a></li> <li><a href="/wiki/Hierarchical_clustering" title="Hierarchical clustering">Hierarchical</a></li> <li><a href="/wiki/K-means_clustering" title="K-means clustering"><i>k</i>-means</a></li> <li><a href="/wiki/Fuzzy_clustering" title="Fuzzy clustering">Fuzzy</a></li> <li><a href="/wiki/Expectation%E2%80%93maximization_algorithm" title="Expectation–maximization algorithm">Expectation–maximization (EM)</a></li> <li><br /><a href="/wiki/DBSCAN" title="DBSCAN">DBSCAN</a></li> <li><a href="/wiki/OPTICS_algorithm" title="OPTICS algorithm">OPTICS</a></li> <li><a href="/wiki/Mean_shift" title="Mean shift">Mean shift</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><a href="/wiki/Dimensionality_reduction" title="Dimensionality reduction">Dimensionality reduction</a></div><div class="sidebar-list-content mw-collapsible-content hlist" style="background-color: #FFFFFF;"> <ul><li><a href="/wiki/Factor_analysis" title="Factor analysis">Factor analysis</a></li> <li><a href="/wiki/Canonical_correlation" title="Canonical correlation">CCA</a></li> <li><a href="/wiki/Independent_component_analysis" title="Independent component analysis">ICA</a></li> <li><a href="/wiki/Linear_discriminant_analysis" title="Linear discriminant analysis">LDA</a></li> <li><a href="/wiki/Non-negative_matrix_factorization" title="Non-negative matrix factorization">NMF</a></li> <li><a href="/wiki/Principal_component_analysis" title="Principal component analysis">PCA</a></li> <li><a href="/wiki/Proper_generalized_decomposition" title="Proper generalized decomposition">PGD</a></li> <li><a href="/wiki/T-distributed_stochastic_neighbor_embedding" title="T-distributed stochastic neighbor embedding">t-SNE</a></li> <li><a href="/wiki/Sparse_dictionary_learning" title="Sparse dictionary learning">SDL</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><a href="/wiki/Structured_prediction" title="Structured prediction">Structured prediction</a></div><div class="sidebar-list-content mw-collapsible-content hlist" style="background-color: #FFFFFF;"> <ul><li><a href="/wiki/Graphical_model" title="Graphical model">Graphical models</a> <ul><li><a href="/wiki/Bayesian_network" title="Bayesian network">Bayes net</a></li> <li><a href="/wiki/Conditional_random_field" title="Conditional random field">Conditional random field</a></li> <li><a 
href="/wiki/Hidden_Markov_model" title="Hidden Markov model">Hidden Markov</a></li></ul></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><a href="/wiki/Anomaly_detection" title="Anomaly detection">Anomaly detection</a></div><div class="sidebar-list-content mw-collapsible-content hlist" style="background-color: #FFFFFF;"> <ul><li><a href="/wiki/Random_sample_consensus" title="Random sample consensus">RANSAC</a></li> <li><a href="/wiki/K-nearest_neighbors_algorithm" title="K-nearest neighbors algorithm"><i>k</i>-NN</a></li> <li><a href="/wiki/Local_outlier_factor" title="Local outlier factor">Local outlier factor</a></li> <li><a href="/wiki/Isolation_forest" title="Isolation forest">Isolation forest</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><a href="/wiki/Artificial_neural_network" class="mw-redirect" title="Artificial neural network">Artificial neural network</a></div><div class="sidebar-list-content mw-collapsible-content hlist" style="background-color: #FFFFFF;"> <ul><li><a href="/wiki/Autoencoder" title="Autoencoder">Autoencoder</a></li> <li><a href="/wiki/Deep_learning" title="Deep learning">Deep learning</a></li> <li><a href="/wiki/Feedforward_neural_network" title="Feedforward neural network">Feedforward neural network</a></li> <li><a href="/wiki/Recurrent_neural_network" title="Recurrent neural network">Recurrent neural network</a> <ul><li><a href="/wiki/Long_short-term_memory" title="Long short-term memory">LSTM</a></li> <li><a href="/wiki/Gated_recurrent_unit" title="Gated recurrent unit">GRU</a></li> <li><a href="/wiki/Echo_state_network" title="Echo state network">ESN</a></li> <li><a href="/wiki/Reservoir_computing" title="Reservoir computing">reservoir computing</a></li></ul></li> <li><a href="/wiki/Boltzmann_machine" title="Boltzmann machine">Boltzmann machine</a> <ul><li><a href="/wiki/Restricted_Boltzmann_machine" title="Restricted Boltzmann machine">Restricted</a></li></ul></li> <li><a href="/wiki/Generative_adversarial_network" title="Generative adversarial network">GAN</a></li> <li><a href="/wiki/Diffusion_model" title="Diffusion model">Diffusion model</a></li> <li><a href="/wiki/Self-organizing_map" title="Self-organizing map">SOM</a></li> <li><a href="/wiki/Convolutional_neural_network" title="Convolutional neural network">Convolutional neural network</a> <ul><li><a href="/wiki/U-Net" title="U-Net">U-Net</a></li> <li><a href="/wiki/LeNet" title="LeNet">LeNet</a></li> <li><a href="/wiki/AlexNet" title="AlexNet">AlexNet</a></li> <li><a href="/wiki/DeepDream" title="DeepDream">DeepDream</a></li></ul></li> <li><a href="/wiki/Neural_radiance_field" title="Neural radiance field">Neural radiance field</a></li> <li><a href="/wiki/Transformer_(machine_learning_model)" class="mw-redirect" title="Transformer (machine learning model)">Transformer</a> <ul><li><a href="/wiki/Vision_transformer" title="Vision transformer">Vision</a></li></ul></li> <li><a href="/wiki/Mamba_(deep_learning_architecture)" title="Mamba (deep learning architecture)">Mamba</a></li> <li><a href="/wiki/Spiking_neural_network" title="Spiking neural network">Spiking neural network</a></li> <li><a 
href="/wiki/Memtransistor" title="Memtransistor">Memtransistor</a></li> <li><a href="/wiki/Electrochemical_RAM" title="Electrochemical RAM">Electrochemical RAM</a> (ECRAM)</li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><a href="/wiki/Reinforcement_learning" title="Reinforcement learning">Reinforcement learning</a></div><div class="sidebar-list-content mw-collapsible-content hlist" style="background-color: #FFFFFF;"> <ul><li><a href="/wiki/Q-learning" title="Q-learning">Q-learning</a></li> <li><a href="/wiki/State%E2%80%93action%E2%80%93reward%E2%80%93state%E2%80%93action" title="State–action–reward–state–action">SARSA</a></li> <li><a href="/wiki/Temporal_difference_learning" title="Temporal difference learning">Temporal difference (TD)</a></li> <li><a href="/wiki/Multi-agent_reinforcement_learning" title="Multi-agent reinforcement learning">Multi-agent</a> <ul><li><a href="/wiki/Self-play_(reinforcement_learning_technique)" class="mw-redirect" title="Self-play (reinforcement learning technique)">Self-play</a></li></ul></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Learning with humans</div><div class="sidebar-list-content mw-collapsible-content hlist" style="background-color: #FFFFFF;"> <ul><li><a href="/wiki/Active_learning_(machine_learning)" title="Active learning (machine learning)">Active learning</a></li> <li><a href="/wiki/Crowdsourcing" title="Crowdsourcing">Crowdsourcing</a></li> <li><a href="/wiki/Human-in-the-loop" title="Human-in-the-loop">Human-in-the-loop</a></li> <li><a href="/wiki/Reinforcement_learning_from_human_feedback" title="Reinforcement learning from human feedback">RLHF</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Model diagnostics</div><div class="sidebar-list-content mw-collapsible-content hlist" style="background-color: #FFFFFF;"> <ul><li><a href="/wiki/Coefficient_of_determination" title="Coefficient of determination">Coefficient of determination</a></li> <li><a href="/wiki/Confusion_matrix" title="Confusion matrix">Confusion matrix</a></li> <li><a href="/wiki/Learning_curve_(machine_learning)" title="Learning curve (machine learning)">Learning curve</a></li> <li><a href="/wiki/Receiver_operating_characteristic" title="Receiver operating characteristic">ROC curve</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Mathematical foundations</div><div class="sidebar-list-content mw-collapsible-content hlist" style="background-color: #FFFFFF;"> <ul><li><a href="/wiki/Kernel_machines" class="mw-redirect" title="Kernel machines">Kernel machines</a></li> <li><a href="/wiki/Bias%E2%80%93variance_tradeoff" title="Bias–variance tradeoff">Bias–variance tradeoff</a></li> <li><a href="/wiki/Computational_learning_theory" 
title="Computational learning theory">Computational learning theory</a></li> <li><a href="/wiki/Empirical_risk_minimization" title="Empirical risk minimization">Empirical risk minimization</a></li> <li><a href="/wiki/Occam_learning" title="Occam learning">Occam learning</a></li> <li><a href="/wiki/Probably_approximately_correct_learning" title="Probably approximately correct learning">PAC learning</a></li> <li><a href="/wiki/Statistical_learning_theory" title="Statistical learning theory">Statistical learning</a></li> <li><a href="/wiki/Vapnik%E2%80%93Chervonenkis_theory" title="Vapnik–Chervonenkis theory">VC theory</a></li> <li><a href="/wiki/Topological_deep_learning" title="Topological deep learning">Topological deep learning</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Journals and conferences</div><div class="sidebar-list-content mw-collapsible-content hlist" style="background-color: #FFFFFF;"> <ul><li><a href="/wiki/ECML_PKDD" title="ECML PKDD">ECML PKDD</a></li> <li><a href="/wiki/Conference_on_Neural_Information_Processing_Systems" title="Conference on Neural Information Processing Systems">NeurIPS</a></li> <li><a href="/wiki/International_Conference_on_Machine_Learning" title="International Conference on Machine Learning">ICML</a></li> <li><a href="/wiki/International_Conference_on_Learning_Representations" title="International Conference on Learning Representations">ICLR</a></li> <li><a href="/wiki/International_Joint_Conference_on_Artificial_Intelligence" title="International Joint Conference on Artificial Intelligence">IJCAI</a></li> <li><a href="/wiki/Machine_Learning_(journal)" title="Machine Learning (journal)">ML</a></li> <li><a href="/wiki/Journal_of_Machine_Learning_Research" title="Journal of Machine Learning Research">JMLR</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Related articles</div><div class="sidebar-list-content mw-collapsible-content hlist" style="background-color: #FFFFFF;"> <ul><li><a href="/wiki/Glossary_of_artificial_intelligence" title="Glossary of artificial intelligence">Glossary of artificial intelligence</a></li> <li><a href="/wiki/List_of_datasets_for_machine-learning_research" title="List of datasets for machine-learning research">List of datasets for machine-learning research</a> <ul><li><a href="/wiki/List_of_datasets_in_computer_vision_and_image_processing" title="List of datasets in computer vision and image processing">List of datasets in computer vision and image processing</a></li></ul></li> <li><a href="/wiki/Outline_of_machine_learning" title="Outline of machine learning">Outline of machine learning</a></li></ul></div></div></td> </tr><tr><td class="sidebar-navbar"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374"><style data-mw-deduplicate="TemplateStyles:r1239400231">.mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{float:left;text-align:left}.mw-parser-output .navbar-boxtext{word-spacing:0}.mw-parser-output .navbar ul{display:inline-block;white-space:nowrap;line-height:inherit}.mw-parser-output 
[[File:Transformer, full architecture.png|thumb|A standard Transformer architecture, showing on the left an encoder, and on the right a decoder. Note: it uses the pre-LN convention, which is different from the post-LN convention used in the original 2017 Transformer.]]

The transformer is a deep learning architecture that was developed by researchers at Google and is based on the multi-head attention mechanism, which was proposed in the 2017 paper "Attention Is All You Need".[1] Text is converted to numerical representations called tokens, and each token is converted into a vector via lookup from a word embedding table.[1] At each layer, each token is then contextualized within the scope of the context window with other (unmasked) tokens via a parallel multi-head attention mechanism, allowing the signal for key tokens to be amplified and less important tokens to be diminished.

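The contextualization step described above can be illustrated with a short sketch of multi-head scaled dot-product attention. This is a minimal NumPy sketch with random weights, no masking, and no positional information; the function and variable names are illustrative assumptions, not code from the cited paper.

<syntaxhighlight lang="python">
import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def multi_head_attention(X, Wq, Wk, Wv, Wo, n_heads):
    """Contextualize each token of X (seq_len x d_model) against all other tokens."""
    seq_len, d_model = X.shape
    d_head = d_model // n_heads
    Q, K, V = X @ Wq, X @ Wk, X @ Wv                       # project tokens to queries/keys/values
    # split into heads: (n_heads, seq_len, d_head)
    split = lambda M: M.reshape(seq_len, n_heads, d_head).transpose(1, 0, 2)
    Q, K, V = split(Q), split(K), split(V)
    scores = Q @ K.transpose(0, 2, 1) / np.sqrt(d_head)     # (n_heads, seq_len, seq_len)
    weights = softmax(scores, axis=-1)                      # attention weights per head
    heads = weights @ V                                     # weighted mix of value vectors
    concat = heads.transpose(1, 0, 2).reshape(seq_len, d_model)
    return concat @ Wo                                      # recombine the heads

# toy usage: 5 random token embeddings in a 64-dimensional model with 4 heads
rng = np.random.default_rng(0)
d_model, n_heads, seq_len = 64, 4, 5
X = rng.normal(size=(seq_len, d_model))
Wq, Wk, Wv, Wo = (rng.normal(size=(d_model, d_model)) * 0.1 for _ in range(4))
print(multi_head_attention(X, Wq, Wk, Wv, Wo, n_heads).shape)   # (5, 64)
</syntaxhighlight>

In a full transformer layer, this output would be combined with the input through a residual connection and followed by layer normalization and a position-wise feedforward network.
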
Transformers have the advantage of having no recurrent units, therefore requiring less training time than earlier recurrent neural architectures (RNNs) such as long short-term memory (LSTM).[2] Later variations have been widely adopted for training large language models (LLMs) on large (language) datasets, such as the Wikipedia corpus and Common Crawl.[3]

Transformers were first developed as an improvement over previous architectures for machine translation,[4][5] but have found many applications since. They are used in large-scale natural language processing, computer vision (vision transformers), reinforcement learning,[6][7] audio,[8] multimodal learning, robotics,[9] and even playing chess.[10] The architecture has also led to the development of pre-trained systems, such as generative pre-trained transformers (GPTs)[11] and BERT[12] (bidirectional encoder representations from transformers).

== History ==
See also: Timeline of machine learning

=== Predecessors ===

For many years, sequence modelling and generation were done by using plain recurrent neural networks (RNNs). A well-cited early example was the Elman network (1990). In theory, the information from one token can propagate arbitrarily far down the sequence, but in practice the vanishing-gradient problem leaves the model's state at the end of a long sentence without precise, extractable information about preceding tokens.

A key breakthrough was LSTM (1995),[note 1] an RNN which used various innovations to overcome the vanishing-gradient problem, allowing efficient learning of long-sequence modelling.
One key innovation was the use of an attention mechanism which used neurons that multiply the outputs of other neurons, so-called multiplicative units.[13] Neural networks using multiplicative units were later called sigma-pi networks[14] or higher-order networks.[15] LSTM became the standard architecture for long-sequence modelling until the 2017 publication of Transformers. However, LSTM still used sequential processing, like most other RNNs.[note 2] Specifically, RNNs operate one token at a time from first to last; they cannot operate in parallel over all tokens in a sequence.

Modern Transformers overcome this problem, but unlike RNNs, they require computation time that is quadratic in the size of the context window. The linearly scaling fast weight controller (1992) learns to compute a weight matrix for further processing depending on the input.[16] One of its two networks has "fast weights" or "dynamic links" (1981).[17][18][19] A slow neural network learns by gradient descent to generate keys and values for computing the weight changes of the fast neural network, which computes answers to queries.[16] This was later shown to be equivalent to the unnormalized linear Transformer.[20][21]

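That equivalence can be illustrated numerically: accumulating outer products of value and key vectors into a "fast" weight matrix and applying it to each query gives the same result as causally masked, unnormalized linear attention. The sketch below is an illustration of this identity under assumed toy shapes, not code from the cited works.

<syntaxhighlight lang="python">
import numpy as np

rng = np.random.default_rng(1)
seq_len, d_key, d_val = 6, 8, 8
K = rng.normal(size=(seq_len, d_key))   # keys produced by the "slow" network
V = rng.normal(size=(seq_len, d_val))   # values produced by the "slow" network
Q = rng.normal(size=(seq_len, d_key))   # queries

# Fast-weight view: accumulate outer products v_i k_i^T into a weight matrix W,
# then answer each query q_t with W q_t (causal: only past and current tokens contribute).
W = np.zeros((d_val, d_key))
fast_out = np.zeros((seq_len, d_val))
for t in range(seq_len):
    W += np.outer(V[t], K[t])
    fast_out[t] = W @ Q[t]

# Attention view: unnormalized linear attention with a causal mask gives the same
# result, since (sum_i v_i k_i^T) q_t = sum_i (q_t . k_i) v_i.
mask = np.tril(np.ones((seq_len, seq_len)))
attn_out = (mask * (Q @ K.T)) @ V

print(np.allclose(fast_out, attn_out))   # True
</syntaxhighlight>
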
class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Transformer_(deep_learning_architecture)&action=edit&section=3" title="Edit section: Attention with seq2seq"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">Main article: <a href="/wiki/Seq2seq#History" title="Seq2seq">Seq2seq § History</a></div> <p>The idea of encoder-decoder sequence transduction had been developed in the early 2010s (see previous papers<sup id="cite_ref-:22_24-0" class="reference"><a href="#cite_note-:22-24"><span class="cite-bracket">[</span>22<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-sequence_25-0" class="reference"><a href="#cite_note-sequence-25"><span class="cite-bracket">[</span>23<span class="cite-bracket">]</span></a></sup>). The papers most commonly cited as the originators that produced seq2seq are two concurrently published papers from 2014.<sup id="cite_ref-:22_24-1" class="reference"><a href="#cite_note-:22-24"><span class="cite-bracket">[</span>22<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-sequence_25-1" class="reference"><a href="#cite_note-sequence-25"><span class="cite-bracket">[</span>23<span class="cite-bracket">]</span></a></sup> </p><p>A 380M-parameter model for machine translation uses two <a href="/wiki/Long_short-term_memory" title="Long short-term memory">long short-term memories</a> (LSTM).<sup id="cite_ref-sequence_25-2" class="reference"><a href="#cite_note-sequence-25"><span class="cite-bracket">[</span>23<span class="cite-bracket">]</span></a></sup> Its architecture consists of two parts. The <i>encoder</i> is an LSTM that takes in a sequence of tokens and turns it into a vector. The <i>decoder</i> is another LSTM that converts the vector into a sequence of tokens. Similarly, another 130M-parameter model used <a href="/wiki/Gated_recurrent_unit" title="Gated recurrent unit">gated recurrent units</a> (GRU) instead of LSTM.<sup id="cite_ref-:22_24-2" class="reference"><a href="#cite_note-:22-24"><span class="cite-bracket">[</span>22<span class="cite-bracket">]</span></a></sup> Later research showed that GRUs are neither better nor worse than LSTMs for seq2seq.<sup id="cite_ref-MyUser_Arxiv.org_May_18_2016c_26-0" class="reference"><a href="#cite_note-MyUser_Arxiv.org_May_18_2016c-26"><span class="cite-bracket">[</span>24<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-gruber_jockisch_27-0" class="reference"><a href="#cite_note-gruber_jockisch-27"><span class="cite-bracket">[</span>25<span class="cite-bracket">]</span></a></sup> </p><p>These early seq2seq models had no attention mechanism, and the state vector is accessible only after the <i>last</i> word of the source text was processed. Although in theory such a vector retains the information about the whole original sentence, in practice the information is poorly preserved. This is because the input is processed sequentially by one recurrent network into a <i>fixed</i>-size output vector, which is then processed by another recurrent network into an output. If the input is long, then the output vector would not be able to contain all relevant information, degrading the output. 
As evidence, reversing the input sentence improved seq2seq translation.[26]

The RNNsearch model introduced an attention mechanism to seq2seq for machine translation to solve the bottleneck problem (of the fixed-size output vector), allowing the model to process long-distance dependencies more easily. The name is because it "emulates searching through a source sentence during decoding a translation".[4]

Comparisons between global (that of RNNsearch) and local (sliding window) attention architectures for machine translation found that mixed attention had higher quality than global attention, while local attention reduced translation time.[27]

In 2016, Google Translate was revamped to Google Neural Machine Translation, which replaced the previous model based on statistical machine translation. The new model was a seq2seq model where the encoder and the decoder were both 8 layers of bidirectional LSTM.[28] It took nine months to develop, and it outperformed the statistical approach, which had taken ten years to develop.[29]

=== Parallelizing attention ===
Main article: Attention (machine learning) § History

Seq2seq models with attention (including self-attention) still suffered from the same issue as recurrent networks: they are hard to parallelize, which prevented them from being accelerated on GPUs.
In 2016, decomposable attention applied a self-attention mechanism to feedforward networks, which are easy to parallelize, and achieved state-of-the-art (SOTA) results in textual entailment with an order of magnitude fewer parameters than LSTMs.[30] One of its authors, Jakob Uszkoreit, suspected that attention without recurrence is sufficient for language translation, thus the title "attention is all you need".[31] That hypothesis was against conventional wisdom at the time, and even his father Hans Uszkoreit, a well-known computational linguist, was skeptical.[31] In the same year, self-attention (called intra-attention or intra-sentence attention) was proposed for LSTMs.[32]

In 2017, the original (100M-sized) encoder-decoder transformer model was proposed in the "Attention is all you need" paper. At the time, the focus of the research was on improving seq2seq for machine translation, by removing its recurrence to process all tokens in parallel, but preserving its dot-product attention mechanism to keep its text processing performance.[1] This led to the introduction of a multi-head attention model that was easier to parallelize due to the use of independent heads and the lack of recurrence.
Its parallelizability was an important factor in its widespread use in large neural networks.[33]

=== AI boom era ===

Already in spring 2017, even before the "Attention is all you need" preprint was published, one of the co-authors applied the "decoder-only" variation of the architecture to generate fictitious Wikipedia articles.[34] The transformer architecture is now used in many generative models that contribute to the ongoing AI boom.

In language modelling, ELMo (2018) was a bi-directional LSTM that produces contextualized word embeddings, improving upon the line of research from bag of words and word2vec. It was followed by BERT (2018), an encoder-only transformer model.[35] In October 2019, Google started using BERT to process search queries.[36] In 2020, Google Translate replaced the previous RNN-encoder–RNN-decoder model with a Transformer-encoder–RNN-decoder model.[37]

Starting in 2018, the OpenAI GPT series of decoder-only Transformers became state of the art in natural language generation.
In 2022, a chatbot based on GPT-3, ChatGPT, became unexpectedly popular,[38] triggering a boom around large language models.[39][40]

Since 2020, Transformers have been applied in modalities beyond text, including the vision transformer,[41] speech recognition,[42] robotics,[6] and multimodal learning.[43] The vision transformer, in turn, stimulated new developments in convolutional neural networks.[44] Image and video generators like DALL-E (2021), Stable Diffusion 3 (2024),[45] and Sora (2024) are based on the Transformer architecture.

<h2>Training</h2>

<h3>Methods for stabilizing training</h3>

The plain transformer architecture had difficulty converging.
In the original paper,[1] the authors recommended using learning rate warmup: the learning rate is scaled up linearly from 0 to its maximal value for the first part of training (usually recommended to be 2% of the total number of training steps) before decaying again.

A 2020 paper found that using layer normalization <i>before</i> (instead of after) the multi-headed attention and feedforward layers stabilizes training and removes the need for learning rate warmup.[46]
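A minimal sketch of such a warmup-then-decay schedule: a linear ramp over the warmup steps followed by a decay. The inverse-square-root decay and the warmup length used here are illustrative choices; the exact shape and length vary between implementations.

<syntaxhighlight lang="python">
def lr_schedule(step, max_lr=1e-3, warmup_steps=4000):
    """Learning rate with linear warmup followed by inverse-square-root decay.

    A sketch of the warmup idea described above; the warmup length and the
    decay shape are illustrative, not the exact values of any given paper.
    """
    step = max(step, 1)
    if step < warmup_steps:
        return max_lr * step / warmup_steps       # linear ramp from 0 to max_lr
    return max_lr * (warmup_steps / step) ** 0.5  # decay after warmup

# The learning rate rises during the first 4000 steps, then falls again.
print(lr_schedule(2000), lr_schedule(4000), lr_schedule(16000))
</syntaxhighlight>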
<h3>Pretrain-finetune</h3>

Transformers typically are first pretrained by self-supervised learning on a large generic dataset, followed by supervised fine-tuning on a small task-specific dataset. The pretraining dataset is typically an unlabeled large corpus, such as The Pile. Tasks for pretraining and fine-tuning commonly include:

<ul><li>language modeling[12]</li>
<li>next-sentence prediction[12]</li>
<li>question answering[3]</li>
<li>reading comprehension</li>
<li>sentiment analysis[1]</li>
<li>paraphrasing[1]</li></ul>

The T5 transformer report[47] documents a large number of natural language pretraining tasks. Some examples are:

<ul><li>restoring or repairing incomplete or corrupted text (see the sketch after this list). For example, the input <i>"Thank you ~~ me to your party ~~ week"</i> might generate the output <i>"Thank you <b>for inviting</b> me to your party <b>last</b> week"</i>.</li>
<li>translation between natural languages (machine translation)</li>
<li>judging the pragmatic acceptability of natural language. For example, the following sentence might be judged "not acceptable",[48] because even though it is syntactically well-formed, it is improbable in ordinary human usage: <i>The course is jumping well.</i></li></ul>

While each of these tasks is trivial or obvious for human native speakers of the language (or languages), they have typically proved challenging for previous generations of machine learning architectures.
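As a rough illustration of the text-restoration task above, the following sketch builds a corrupted input and its restoration target by hiding random spans behind sentinel markers. The sentinel format and span lengths are illustrative assumptions, not the exact preprocessing of T5 or any other model.

<syntaxhighlight lang="python">
import random

def corrupt_spans(tokens, corruption_rate=0.15, seed=0):
    """Mask random contiguous spans and build an (input, target) pair.

    Masked spans are replaced by sentinel markers <X0>, <X1>, ...; the target
    lists each sentinel followed by the tokens it hides. Illustrative only.
    """
    rng = random.Random(seed)
    inp, tgt, i, sentinel = [], [], 0, 0
    while i < len(tokens):
        if rng.random() < corruption_rate:
            span = rng.randint(1, 3)                    # hide 1 to 3 tokens
            tgt += [f"<X{sentinel}>"] + tokens[i:i + span]
            inp.append(f"<X{sentinel}>")
            sentinel += 1
            i += span
        else:
            inp.append(tokens[i])
            i += 1
    return " ".join(inp), " ".join(tgt)

print(corrupt_spans("Thank you for inviting me to your party last week".split()))
</syntaxhighlight>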
<h3>Tasks</h3>

See also: Large language model § Evaluation

In general, there are three classes of language modelling tasks: "masked",[49] "autoregressive",[50] and "prefixLM".[51] These classes are independent of any specific modelling architecture such as the Transformer, but they are often discussed in the context of the Transformer.

In a masked task,[49] one or more of the tokens is masked out, and the model produces a probability distribution predicting what the masked-out tokens are, based on the context. The loss function for the task is typically the sum of log-perplexities for the masked-out tokens:
<math display="block">\text{Loss} = -\sum_{t\in\text{masked tokens}} \ln(\text{probability of } t \text{ conditional on its context})</math>
and the model is trained to minimize this loss function.
The BERT series of models are trained for masked token prediction and another task.

In an autoregressive task,[50] the entire sequence is masked at first, and the model produces a probability distribution for the first token. Then the first token is revealed and the model predicts the second token, and so on. The loss function for the task is still typically the same. The GPT series of models are trained by autoregressive tasks.

In a prefixLM task,[51] the sequence is divided into two parts. The first part is presented as context, and the model predicts the first token of the second part. Then that token is revealed, and the model predicts the next token, and so on. The loss function for the task is still typically the same. The T5 series of models are trained by prefixLM tasks.

Note that "masked" as in "masked language modelling" is not "masked" as in "masked attention", and "prefixLM" (prefix language modelling) is not "prefixLM" (prefix language model).
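A minimal sketch contrasting the masked and autoregressive objectives. Per-position logits over the vocabulary are assumed as input; random tensors stand in for a model's output, and PyTorch's cross-entropy supplies the negative log-likelihood.

<syntaxhighlight lang="python">
import torch
import torch.nn.functional as F

def masked_lm_loss(logits, tokens, masked_positions):
    """Sum of negative log-likelihoods at the masked-out positions only."""
    return F.cross_entropy(
        logits[masked_positions],        # (num_masked, vocab) predictions
        tokens[masked_positions],        # (num_masked,) true token ids
        reduction="sum",
    )

def autoregressive_lm_loss(logits, tokens):
    """Every position predicts the next token; all positions contribute."""
    return F.cross_entropy(
        logits[:-1],                     # prediction at position t ...
        tokens[1:],                      # ... is scored against token t+1
        reduction="sum",
    )

# Toy example with random "model outputs" over a 10-token vocabulary.
vocab, seq_len = 10, 6
logits = torch.randn(seq_len, vocab)
tokens = torch.randint(vocab, (seq_len,))
masked = torch.tensor([1, 4])
print(masked_lm_loss(logits, tokens, masked), autoregressive_lm_loss(logits, tokens))
</syntaxhighlight>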
<h2>Architecture</h2>

All transformers have the same primary components:

<ul><li>Tokenizers, which convert text into tokens.</li>
<li>An embedding layer, which converts tokens and positions of the tokens into vector representations.</li>
<li>Transformer layers, which carry out repeated transformations on the vector representations, extracting more and more linguistic information. These consist of alternating attention and feedforward layers. There are two major types of transformer layers: encoder layers and decoder layers, with further variants.</li>
<li>An un-embedding layer, which converts the final vector representations back into a probability distribution over the tokens.</li></ul>

The following description follows exactly the Transformer as described in the original paper. There are variants, described in the following section.

By convention, we write all vectors as row vectors. This means, for example, that pushing a vector through a linear layer amounts to multiplying it by a weight matrix on the right, as <math>xW</math>.

<h3>Tokenization</h3>

Main article: Lexical analysis

As the Transformer architecture natively processes numerical data, not text, there must be a translation between text and tokens. A token is an integer that represents a character or a short segment of characters. On the input side, the input text is parsed into a token sequence. Similarly, on the output side, the output tokens are parsed back into text. The module doing the conversion between text and token sequences is a tokenizer.

The set of all tokens is the vocabulary of the tokenizer, and its size is the <i>vocabulary size</i> <math>n_{\text{vocabulary}}</math>. When faced with tokens outside the vocabulary, typically a special token is used, written as "[UNK]" for "unknown".

Some commonly used tokenizers are byte pair encoding, WordPiece, and SentencePiece.
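A minimal sketch of the text-to-token mapping, using a toy word-level vocabulary rather than a trained byte-pair-encoding model; the vocabulary and the "[UNK]" handling here are illustrative assumptions.

<syntaxhighlight lang="python">
vocab = {"[UNK]": 0, "the": 1, "dog": 2, "bites": 3, "man": 4}
inverse_vocab = {i: w for w, i in vocab.items()}

def encode(text):
    """Map text to integer token ids, using [UNK] for out-of-vocabulary words."""
    return [vocab.get(word, vocab["[UNK]"]) for word in text.lower().split()]

def decode(token_ids):
    """Map token ids back to text."""
    return " ".join(inverse_vocab[i] for i in token_ids)

print(encode("man bites dog"))           # [4, 3, 2]
print(decode(encode("the cat bites")))   # "the [UNK] bites"
</syntaxhighlight>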
<h3>Embedding</h3>

Further information: Word embedding

Each token is converted into an embedding vector via a lookup table. Equivalently stated, it multiplies a one-hot representation of the token by an embedding matrix <math>M</math>. For example, if the input token is <math>3</math>, then the one-hot representation is <math>[0, 0, 0, 1, 0, 0, \dots]</math>, and its embedding vector is
<math display="block">\mathrm{Embed}(3) = [0, 0, 0, 1, 0, 0, \dots]M</math>
The token embedding vectors are added to their respective positional encoding vectors (see below), producing the sequence of input vectors.

The number of dimensions in an embedding vector is called the <i>hidden size</i> or <i>embedding size</i> and written as <math>d_{\text{emb}}</math>.[35] This size is written as <math>d_{\text{model}}</math> in the original Transformer paper.[1]
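A minimal sketch of the embedding lookup and its equivalence to a one-hot matrix product; the vocabulary size and embedding dimension are arbitrary illustrative choices.

<syntaxhighlight lang="python">
import torch

n_vocabulary, d_emb = 8, 4
M = torch.randn(n_vocabulary, d_emb)     # embedding matrix, one row per token

token = 3
one_hot = torch.zeros(n_vocabulary)
one_hot[token] = 1.0

# Looking up row `token` of M and multiplying the one-hot row vector by M
# produce the same embedding vector.
assert torch.allclose(M[token], one_hot @ M)
print(M[token])
</syntaxhighlight>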
<h3>Un-embedding</h3>

An un-embedding layer is almost the reverse of an embedding layer. Whereas an embedding layer converts a token into a vector, an un-embedding layer converts a vector into a probability distribution over tokens.

The un-embedding layer is a linear-softmax layer:
<math display="block">\mathrm{UnEmbed}(x) = \mathrm{softmax}(xW + b)</math>
The matrix <math>W</math> has shape <math>(d_{\text{emb}}, n_{\text{vocabulary}})</math>.
The embedding matrix <math>M</math> and the un-embedding matrix <math>W</math> are sometimes required to be transposes of each other, a practice called weight tying.[52]
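A minimal sketch of the un-embedding layer, including the optional weight-tying choice in which the un-embedding matrix is the transpose of the embedding matrix; the sizes are illustrative.

<syntaxhighlight lang="python">
import torch

n_vocabulary, d_emb = 8, 4
M = torch.randn(n_vocabulary, d_emb)      # embedding matrix
b = torch.zeros(n_vocabulary)             # un-embedding bias

def unembed(x, W, b):
    """Map a d_emb-dimensional row vector to a probability distribution over tokens."""
    return torch.softmax(x @ W + b, dim=-1)

x = torch.randn(d_emb)
W_tied = M.T                              # weight tying: W is the transpose of M
probs = unembed(x, W_tied, b)
print(probs, probs.sum())                 # the probabilities sum to 1
</syntaxhighlight>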
<h3>Positional encoding</h3>

[Figure: A diagram of a sinusoidal positional encoding with parameters <math>N = 10000, d = 100</math>]

A positional encoding is a fixed-size vector representation of the relative positions of tokens within a sequence: it provides the transformer model with information about <i>where</i> the words are in the input sequence. It induces an inductive bias towards the order of the input sequence, so that, for example, the input sequence "man bites dog" is processed differently from "dog bites man".

The positional encoding is defined as a function of type <math>f: \mathbb{R} \to \mathbb{R}^{d}; d \in \mathbb{Z}, d > 0</math>, where <math>d</math> is a positive even integer.
The full positional encoding defined in the original paper[1] is:
<math display="block">(f(t)_{2k}, f(t)_{2k+1}) = (\sin(\theta), \cos(\theta)) \quad \forall k \in \{0, 1, \ldots, d/2 - 1\}</math>
where <math>\theta = \frac{t}{r^{k}}, r = N^{2/d}</math>.
Here, <math>N</math> is a free parameter that should be significantly larger than the biggest <math>t</math> that would be input into the positional encoding function. The original paper uses <math>N = 10000</math>.
The function is in a simpler form when written as a complex function of type <math>f: \mathbb{R} \to \mathbb{C}^{d/2}</math>:
<math display="block">f(t) = \left(e^{it/r^{k}}\right)_{k=0,1,\ldots,\frac{d}{2}-1}</math>
where <math>r = N^{2/d}</math>.
The main reason for using this positional encoding function is that, with it, shifts are linear transformations:
<math display="block">f(t + \Delta t) = \mathrm{diag}(f(\Delta t))\, f(t)</math>
where <math>\Delta t \in \mathbb{R}</math> is the distance one wishes to shift. This allows the transformer to take any encoded position and find the encoding of the position n steps ahead or n steps behind, by a matrix multiplication.
By taking a linear sum, any convolution can also be implemented as a linear transformation:
<math display="block">\sum_{j} c_{j} f(t + \Delta t_{j}) = \left(\sum_{j} c_{j}\, \mathrm{diag}(f(\Delta t_{j}))\right) f(t)</math>
for any constants <math>c_{j}</math>. This allows the transformer to take any encoded position and find a linear sum of the encoded locations of its neighbors. This sum of encoded positions, when fed into the attention mechanism, would create attention weights on its neighbors, much like what happens in a convolutional neural network language model. In the authors' words, "we hypothesized it would allow the model to easily learn to attend by relative position."

In typical implementations, all operations are done over the real numbers, not the complex numbers, but since complex multiplication can be implemented as real 2-by-2 matrix multiplication, this is a mere notational difference.
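A minimal sketch of the sinusoidal positional encoding defined above, computed with NumPy; <math>d = 100</math> and <math>N = 10000</math> match the figure, while the sequence length is an arbitrary illustrative choice.

<syntaxhighlight lang="python">
import numpy as np

def positional_encoding(seq_len, d, N=10000):
    """Sinusoidal positional encoding: row t holds f(t), with
    f(t)[2k] = sin(t / r**k) and f(t)[2k+1] = cos(t / r**k), where r = N**(2/d)."""
    r = N ** (2.0 / d)
    k = np.arange(d // 2)                 # frequency index
    t = np.arange(seq_len)[:, None]       # position index, as a column
    theta = t / r ** k                    # shape (seq_len, d/2)
    enc = np.empty((seq_len, d))
    enc[:, 0::2] = np.sin(theta)          # even dimensions
    enc[:, 1::2] = np.cos(theta)          # odd dimensions
    return enc

pe = positional_encoding(seq_len=50, d=100)
print(pe.shape)                           # (50, 100)
</syntaxhighlight>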
<h3>Encoder-decoder (overview)</h3>

[Figure: One encoder-decoder block]

[Figure: A Transformer is composed of stacked encoder layers and decoder layers.]

Like earlier seq2seq models, the original transformer model used an <b>encoder-decoder</b> architecture. The encoder consists of encoding layers that process all the input tokens together, one layer after another, while the decoder consists of decoding layers that iteratively process the encoder's output and the decoder's output tokens so far.

The purpose of each encoder layer is to create contextualized representations of the tokens, where each representation corresponds to a token that "mixes" information from other input tokens via a self-attention mechanism. Each decoder layer contains two attention sublayers: (1) cross-attention for incorporating the output of the encoder (contextualized input token representations), and (2) self-attention for "mixing" information among the input tokens to the decoder (i.e., the tokens generated so far during inference time).[53][54]
Both the encoder and decoder layers have a feed-forward neural network for additional processing of their outputs and contain residual connections and layer normalization steps.[54] These feed-forward layers contain most of the parameters in a Transformer model.
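A minimal sketch of one encoder layer as just described, using PyTorch's built-in multi-head attention in place of the per-head construction detailed in the following sections. The dimensions are the original paper's defaults (embedding size 512, 8 heads, feedforward size 2048), and the post-layer-normalization ordering follows the original architecture.

<syntaxhighlight lang="python">
import torch
from torch import nn

class EncoderLayer(nn.Module):
    """One encoder layer: self-attention, then a feedforward network,
    each wrapped with a residual connection and layer normalization."""

    def __init__(self, d_emb=512, n_heads=8, d_ffn=2048):
        super().__init__()
        self.attn = nn.MultiheadAttention(d_emb, n_heads, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(d_emb, d_ffn), nn.ReLU(), nn.Linear(d_ffn, d_emb)
        )
        self.norm1 = nn.LayerNorm(d_emb)
        self.norm2 = nn.LayerNorm(d_emb)

    def forward(self, x):                   # x: (batch, seq_len, d_emb)
        attn_out, _ = self.attn(x, x, x)    # self-attention: queries, keys, values all x
        x = self.norm1(x + attn_out)        # residual connection + layer norm
        x = self.norm2(x + self.ffn(x))     # feedforward, residual, layer norm
        return x

layer = EncoderLayer()
print(layer(torch.randn(2, 10, 512)).shape)  # torch.Size([2, 10, 512])
</syntaxhighlight>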
<h3>Feedforward network</h3>

[Figure: The feedforward network module. It is a two-layered network that maps <math>d_{\text{emb}}</math>-dimensional vectors into <math>d_{\text{emb}}</math>-dimensional vectors.]

The feedforward network (FFN) modules in a Transformer are 2-layered multilayer perceptrons:
<math display="block">\mathrm{FFN}(x) = \phi(xW^{(1)} + b^{(1)})W^{(2)} + b^{(2)}</math>
(x)=\phi (xW^{(1)}+b^{(1)})W^{(2)}+b^{(2)}}"></span>where <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \phi }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>ϕ<!-- ϕ --></mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \phi }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/72b1f30316670aee6270a28334bdf4f5072cdde4" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:1.385ex; height:2.509ex;" alt="{\displaystyle \phi }"></span> is its activation function. The original Transformer used <a href="/wiki/Rectifier_(neural_networks)" title="Rectifier (neural networks)">ReLU</a> activation. </p><p>The number of neurons in the middle layer is called <i>intermediate size</i> (GPT),<sup id="cite_ref-57" class="reference"><a href="#cite_note-57"><span class="cite-bracket">[</span>55<span class="cite-bracket">]</span></a></sup> <i>filter size</i> (BERT),<sup id="cite_ref-:03_37-2" class="reference"><a href="#cite_note-:03-37"><span class="cite-bracket">[</span>35<span class="cite-bracket">]</span></a></sup> or <i>feedforward size</i> (BERT).<sup id="cite_ref-:03_37-3" class="reference"><a href="#cite_note-:03-37"><span class="cite-bracket">[</span>35<span class="cite-bracket">]</span></a></sup> It is typically larger than the embedding size. For example, in both GPT-2 series and BERT series, the intermediate size of a model is 4 times its embedding size: <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle d_{\text{ffn}}=4d_{\text{emb}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>d</mi> <mrow class="MJX-TeXAtom-ORD"> <mtext>ffn</mtext> </mrow> </msub> <mo>=</mo> <mn>4</mn> <msub> <mi>d</mi> <mrow class="MJX-TeXAtom-ORD"> <mtext>emb</mtext> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle d_{\text{ffn}}=4d_{\text{emb}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/11c9e89a82cdff80cd9e03abfef22730a29bf958" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:12.077ex; height:2.509ex;" alt="{\displaystyle d_{\text{ffn}}=4d_{\text{emb}}}"></span>. 
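As an illustration (not part of the original article), the following is a minimal NumPy sketch of the FFN module just described, assuming ReLU as the activation <math>\phi</math> and GPT-2-style sizes with <math>d_\text{ffn} = 4d_\text{emb}</math>; the function and weight names are illustrative, and the weights are random placeholders rather than trained parameters.

<syntaxhighlight lang="python">
import numpy as np

def ffn(x, W1, b1, W2, b2):
    """Position-wise feedforward module: FFN(x) = ReLU(x W1 + b1) W2 + b2."""
    return np.maximum(x @ W1 + b1, 0.0) @ W2 + b2

d_emb = 768                       # embedding size (GPT-2 small)
d_ffn = 4 * d_emb                 # intermediate size, 4x the embedding size
rng = np.random.default_rng(0)
W1 = rng.normal(scale=0.02, size=(d_emb, d_ffn)); b1 = np.zeros(d_ffn)
W2 = rng.normal(scale=0.02, size=(d_ffn, d_emb)); b2 = np.zeros(d_emb)

x = rng.normal(size=(10, d_emb))  # 10 token vectors, processed position-wise
y = ffn(x, W1, b1, W2, b2)        # shape (10, 768): same dimensionality in and out
</syntaxhighlight>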
</p> <div class="mw-heading mw-heading3"><h3 id="Scaled_dot-product_attention">Scaled dot-product attention</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Transformer_(deep_learning_architecture)&action=edit&section=17" title="Edit section: Scaled dot-product attention"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">Main article: <a href="/wiki/Dot-product_attention" class="mw-redirect" title="Dot-product attention">Dot-product attention</a></div> <div class="mw-heading mw-heading4"><h4 id="Attention_head">Attention head</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Transformer_(deep_learning_architecture)&action=edit&section=18" title="Edit section: Attention head"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <figure class="mw-default-size" typeof="mw:File/Thumb"><a href="/wiki/File:Transformer,_attention_block_diagram.png" class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/1/1b/Transformer%2C_attention_block_diagram.png/220px-Transformer%2C_attention_block_diagram.png" decoding="async" width="220" height="586" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/1/1b/Transformer%2C_attention_block_diagram.png/330px-Transformer%2C_attention_block_diagram.png 1.5x, //upload.wikimedia.org/wikipedia/commons/1/1b/Transformer%2C_attention_block_diagram.png 2x" data-file-width="342" data-file-height="911" /></a><figcaption>Scaled dot-product attention, block diagram</figcaption></figure> <figure class="mw-default-size" typeof="mw:File/Thumb"><a href="/wiki/File:Transformer_architecture_-_Attention_Head_module.png" class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/c/c4/Transformer_architecture_-_Attention_Head_module.png/220px-Transformer_architecture_-_Attention_Head_module.png" decoding="async" width="220" height="96" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/c/c4/Transformer_architecture_-_Attention_Head_module.png/330px-Transformer_architecture_-_Attention_Head_module.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/c/c4/Transformer_architecture_-_Attention_Head_module.png/440px-Transformer_architecture_-_Attention_Head_module.png 2x" data-file-width="1453" data-file-height="633" /></a><figcaption>Exact dimension counts within an attention head module </figcaption></figure> <p>The attention mechanism used in the Transformer architecture are scaled <a href="/wiki/Dot_product" title="Dot product">dot-product</a> <a href="/wiki/Attention_(machine_learning)" title="Attention (machine learning)">attention</a> units. 
For each unit, the transformer model learns three weight matrices: the query weights <math>W^Q</math>, the key weights <math>W^K</math>, and the value weights <math>W^V</math>.

The module takes three sequences: a query sequence, a key sequence, and a value sequence. The query sequence is a sequence of length <math>\ell_\text{seq, query}</math>, and each entry is a vector of dimension <math>d_\text{emb, query}</math>; similarly for the key and value sequences.

Each vector <math>x_{i,\text{query}}</math> in the query sequence is multiplied by the matrix <math>W^Q</math> to produce a query vector <math>q_i = x_{i,\text{query}}W^Q</math>. The matrix of all query vectors is the query matrix:
<math display="block">Q = X_\text{query}W^Q</math>
Similarly, we construct the key matrix <math>K = X_\text{key}W^K</math> and the value matrix <math>V = X_\text{value}W^V</math>.

It is usually the case that all <math>W^Q, W^K, W^V</math> are square matrices, meaning <math>d_\text{emb, query} = d_\text{query}</math>, etc.
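This projection step can be sketched in NumPy as follows (a minimal illustration, not from the article; the sizes and random inputs are arbitrary, and the self-attention case where the query, key, and value inputs are all the same matrix <math>X</math> is assumed).

<syntaxhighlight lang="python">
import numpy as np

rng = np.random.default_rng(0)
l_seq, d_emb = 6, 768                 # sequence length and embedding dimension (illustrative)
X = rng.normal(size=(l_seq, d_emb))   # self-attention: query, key and value inputs are the same X

# Square projection matrices, as is usually the case.
W_Q = rng.normal(scale=0.02, size=(d_emb, d_emb))
W_K = rng.normal(scale=0.02, size=(d_emb, d_emb))
W_V = rng.normal(scale=0.02, size=(d_emb, d_emb))

Q = X @ W_Q   # row i is the query vector q_i = x_i W^Q
K = X @ W_K   # row i is the key vector   k_i = x_i W^K
V = X @ W_V   # row i is the value vector v_i = x_i W^V
</syntaxhighlight>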
Attention weights are calculated using the query and key vectors: the attention weight <math>a_{ij}</math> from token <math>i</math> to token <math>j</math> is the dot product between <math>q_i</math> and <math>k_j</math>. The attention weights are divided by the square root of the dimension of the key vectors, <math>\sqrt{d_k}</math>, which stabilizes gradients during training, and passed through a softmax which normalizes the weights. The fact that <math>W^Q</math> and <math>W^K</math> are different matrices allows attention to be non-symmetric: if token <math>i</math> attends to token <math>j</math> (i.e. <math>q_i \cdot k_j</math> is large), this does not necessarily mean that token <math>j</math> will attend to token <math>i</math> (i.e. <math>q_j \cdot k_i</math> could be small). The output of the attention unit for token <math>i</math> is the weighted sum of the value vectors of all tokens, weighted by <math>a_{ij}</math>, the attention from token <math>i</math> to each token.
The attention calculation for all tokens can be expressed as one large matrix calculation using the softmax function, which is useful for training because optimized matrix-multiplication routines compute it quickly. The matrices <math>Q</math>, <math>K</math> and <math>V</math> are defined as the matrices whose <math>i</math>th rows are the vectors <math>q_i</math>, <math>k_i</math>, and <math>v_i</math> respectively. Then we can represent the attention as
<math display="block">\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^\mathrm{T}}{\sqrt{d_k}}\right)V</math>
where the softmax is applied over each of the rows of the matrix.

The number of dimensions in a query vector is the ''query size'' <math>d_\text{query}</math>, and similarly for the ''key size'' <math>d_\text{key}</math> and ''value size'' <math>d_\text{value}</math>. The output dimension of an attention head is its ''head dimension'' <math>d_\text{head}</math>.
The attention mechanism requires the following three equalities to hold:
<math display="block">\ell_\text{seq, key} = \ell_\text{seq, value},\; d_\text{query} = d_\text{key},\; d_\text{value} = d_\text{head}</math>
but is otherwise unconstrained.

If the attention head is used in a self-attention fashion, then <math>X_\text{query} = X_\text{key} = X_\text{value}</math>. If the attention head is used in a cross-attention fashion, then usually <math>X_\text{query} \neq X_\text{key} = X_\text{value}</math>. It is theoretically possible for all three to be different, but that is rarely the case in practice.
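A minimal NumPy sketch of this attention formula follows (an illustration, not a reference implementation); the shapes obey the constraints listed above, and the function and variable names are illustrative.

<syntaxhighlight lang="python">
import numpy as np

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)          # subtract row max for numerical stability
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

def attention(Q, K, V):
    """Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V, with softmax over each row."""
    d_k = K.shape[-1]
    scores = Q @ K.T / np.sqrt(d_k)                # (l_seq_query, l_seq_key) scaled dot products
    return softmax(scores) @ V                     # (l_seq_query, d_head) weighted sums of values

rng = np.random.default_rng(0)
l_q, l_kv, d_k, d_head = 4, 6, 64, 64              # l_seq_key = l_seq_value, d_query = d_key, d_value = d_head
Q = rng.normal(size=(l_q, d_k))
K = rng.normal(size=(l_kv, d_k))
V = rng.normal(size=(l_kv, d_head))
out = attention(Q, K, V)                           # shape (4, 64)
</syntaxhighlight>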
==== Multiheaded attention ====

[Figure: Multiheaded attention, block diagram]

[Figure: Exact dimension counts within a multiheaded attention module]

One set of <math>\left(W^Q, W^K, W^V\right)</math> matrices is called an ''attention head'', and each layer in a transformer model has multiple attention heads. While each attention head attends to the tokens that are relevant to each token, multiple attention heads allow the model to do this for different definitions of "relevance". Specifically, the query and key projection matrices, <math>W^Q</math> and <math>W^K</math>, which are involved in the attention score computation, define the "relevance". Meanwhile, the value projection matrix <math>W^V</math>, in combination with the corresponding part of the output projection matrix <math>W^O</math>, determines how the attended tokens influence what information is passed to subsequent layers and ultimately the output logits. In addition, the scope of attention, or the range of token relationships captured by each attention head, can expand as tokens pass through successive layers. This allows the model to capture more complex and long-range dependencies in deeper layers.

Many transformer attention heads encode relevance relations that are meaningful to humans. For example, some attention heads can attend mostly to the next word, while others mainly attend from verbs to their direct objects.[56] The computations for each attention head can be performed in parallel, which allows for fast processing. The outputs of the attention layer are concatenated and passed into the feed-forward neural network layers.
Concretely, let the multiple attention heads be indexed by $i$; then we have

$$\text{MultiheadedAttention}(Q, K, V) = \text{Concat}_{i \in [n_{\text{heads}}]}\bigl(\text{Attention}(Q W_i^Q, K W_i^K, V W_i^V)\bigr)\, W^O$$

where the matrix $X$ is the concatenation of word embeddings, the matrices $W_i^Q, W_i^K, W_i^V$ are "projection matrices" owned by the individual attention head $i$, and $W^O$ is a final projection matrix owned by the whole multi-headed attention module.
It is theoretically possible for each attention head to have a different head dimension $d_{\text{head}}$, but that is rarely the case in practice.

As an example, in the smallest GPT-2 model, there are only self-attention mechanisms. It has the following dimensions:

$$d_{\text{emb}} = 768, \quad n_{\text{head}} = 12, \quad d_{\text{head}} = 64$$

Since $12 \times 64 = 768$, its output projection matrix $W^O \in \mathbb{R}^{(12 \times 64) \times 768}$ is a square matrix.

==== Masked attention ====

The Transformer architecture is constructed to calculate output tokens iteratively. Assuming $t = 0$ refers to the calculation of the first output token $i = 0$, then for every step $t > 0$ the output token $i = 0$ shall remain constant. This ensures properties of the model similar to autoregressive models.[1] Therefore, at every time step $t$, the calculation for an output at position $i$ should not have access to tokens at positions $j > i$ (as is naturally the case for time step $t = i$, when tokens $j > t$ are not yet calculated).
This behavior may be accomplished before the softmax stage by adding a mask matrix $M$ that is $-\infty$ at entries where the attention link must be cut, and $0$ at other places:

$$\text{MaskedAttention}(Q, K, V) = \text{softmax}\left(M + \frac{QK^{\mathrm{T}}}{\sqrt{d_k}}\right)V$$

The following matrix is commonly used in decoder self-attention modules, called "causal masking":

$$M_{\text{causal}} = \begin{bmatrix} 0 & -\infty & -\infty & \dots & -\infty \\ 0 & 0 & -\infty & \dots & -\infty \\ 0 & 0 & 0 & \dots & -\infty \\ \vdots & \vdots & \vdots & \ddots & \vdots \\ 0 & 0 & 0 & \dots & 0 \end{bmatrix}$$

In words, it means that each token can pay attention to itself, and every token before it, but not any after it.
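A minimal sketch of how such a mask can be constructed and applied (the function names are illustrative, and numerical details are simplified):

import numpy as np

def causal_mask(n):
    # 0 on and below the diagonal, -inf strictly above it
    mask = np.zeros((n, n))
    mask[np.triu_indices(n, k=1)] = -np.inf
    return mask

def masked_attention(Q, K, V, M):
    d_k = K.shape[-1]
    scores = M + Q @ K.T / np.sqrt(d_k)
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights = weights / weights.sum(axis=-1, keepdims=True)  # softmax; exp(-inf) = 0 cuts the link
    return weights @ V

X = np.random.default_rng(1).normal(size=(4, 8))
out = masked_attention(X, X, X, causal_mask(4))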
A non-masked attention module can be thought of as a masked attention module where the mask has all entries zero. As an example of an uncommon use of the mask matrix, XLNet considers all masks of the form $P M_{\text{causal}} P^{-1}$, where $P$ is a random permutation matrix.[57]

=== Encoder ===

[Figure: One encoder layer]

An encoder consists of an embedding layer, followed by multiple encoder layers.

Each encoder layer consists of two major components: a self-attention mechanism and a feed-forward layer. It takes as input a sequence of input vectors, applies the self-attention mechanism to produce an intermediate sequence of vectors, and then applies the feed-forward layer to each vector individually. Schematically, we have:

$$\begin{aligned}
\text{given input vectors } & h_0, h_1, \dots \\
\text{combine them into a matrix } H &= \begin{bmatrix} h_0 \\ h_1 \\ \vdots \end{bmatrix} \\
\text{EncoderLayer}(H) &= \begin{bmatrix} \text{FFN}(\text{MultiheadedAttention}(H, H, H)_0) \\ \text{FFN}(\text{MultiheadedAttention}(H, H, H)_1) \\ \vdots \end{bmatrix}
\end{aligned}$$

where $\text{FFN}$ stands for "feed-forward network". We can write this more succinctly as

$$\text{EncoderLayer}(H) = \text{FFN}(\text{MultiheadedAttention}(H, H, H))$$

with the implicit convention that $\text{FFN}$ is applied to each row of the matrix individually.
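A minimal sketch of one encoder layer under this convention, omitting the residual connections and layer normalization discussed below; the FFN weights and toy shapes are invented for illustration, and the placeholder attention function stands in for a real multi-headed attention such as the earlier sketch:

import numpy as np

def relu_ffn(H, W1, b1, W2, b2):
    # position-wise feed-forward network, applied to each row of H independently
    return np.maximum(H @ W1 + b1, 0.0) @ W2 + b2

def encoder_layer(H, self_attention, ffn):
    # self_attention(Q, K, V): any multi-headed attention function;
    # in an encoder layer, queries, keys and values are all H
    return ffn(self_attention(H, H, H))

# toy usage with invented shapes: 5 tokens, model width 8, FFN width 32
rng = np.random.default_rng(2)
H = rng.normal(size=(5, 8))
W1, b1 = rng.normal(size=(8, 32)), np.zeros(32)
W2, b2 = rng.normal(size=(32, 8)), np.zeros(8)
out = encoder_layer(
    H,
    self_attention=lambda Q, K, V: V,  # placeholder attention, for the demo only
    ffn=lambda A: relu_ffn(A, W1, b1, W2, b2),
)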
The encoder layers are stacked. The first encoder layer takes the sequence of input vectors from the embedding layer and produces a sequence of vectors. This sequence of vectors is processed by the second encoder layer, and so on. The output from the final encoder layer is then used by the decoder.

As the encoder processes the entire input all at once, every token can attend to every other token (all-to-all attention), so there is no need for causal masking.

=== Decoder ===

[Figure: One decoder layer]

A decoder consists of an embedding layer, followed by multiple decoder layers, followed by an un-embedding layer.

Each decoder layer consists of three major components: a causally masked self-attention mechanism, a cross-attention mechanism, and a feed-forward neural network. The decoder functions in a similar fashion to the encoder, but an additional attention mechanism is inserted which instead draws relevant information from the encodings generated by the encoders. This mechanism can also be called the encoder-decoder attention.[1][54]

Like the first encoder layer, the first decoder layer takes positional information and embeddings of the output sequence as its input, rather than encodings. The transformer must not use the current or future output to predict an output, so the output sequence must be partially masked to prevent this reverse information flow.[1] This allows for autoregressive text generation. For decoding, all-to-all attention is inappropriate, because a token cannot attend to tokens not yet generated. Thus, the self-attention module in the decoder is causally masked.
In contrast, the cross-attention mechanism attends to the output vectors of the encoder, which are computed before the decoder starts decoding. Consequently, there is no need for masking in the cross-attention mechanism.

Schematically, we have:

$$\begin{aligned}
H' &= \text{MaskedMultiheadedAttention}(H, H, H) \\
\text{DecoderLayer}(H) &= \text{FFN}(\text{MultiheadedAttention}(H', H^E, H^E))
\end{aligned}$$

where $H^E$ is the matrix whose rows are the output vectors from the encoder.
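Under the same simplifications (no residual connections or normalization), a decoder layer can be sketched as follows; the argument names are illustrative, and the attention and FFN callables stand for implementations such as the earlier sketches:

def decoder_layer(H, H_E, masked_self_attention, cross_attention, ffn):
    # H:   current decoder-side representations (one row per output token)
    # H_E: output vectors of the encoder (one row per input token)
    H_prime = masked_self_attention(H, H, H)   # causally masked self-attention
    A = cross_attention(H_prime, H_E, H_E)     # queries from decoder, keys/values from encoder
    return ffn(A)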
The last decoder layer is followed by a final un-embedding layer to produce the output probabilities over the vocabulary. One of the tokens is then sampled according to these probabilities, and the decoder can be run again to produce the next token, and so on, autoregressively generating the output text.

=== Adapted architectures ===

Many large language models, since they do not need to predict a whole new sequence from an input sequence, use only the encoder or the decoder of the original transformer architecture. Early GPT models are decoder-only models trained to predict the next token in a sequence.[58] BERT, another language model, only makes use of an encoder, and is trained to predict a randomly masked token in a sequence.[35]

== Full transformer architecture ==

=== Sublayers ===

[Figure: (a) One encoder layer and one decoder layer. (b) Two encoder layers and two decoder layers. The sublayers are labelled as well.]

Each encoder layer contains two sublayers: the self-attention and the feedforward network. Each decoder layer contains three sublayers: the causally masked self-attention, the cross-attention, and the feedforward network.
[Figure: Transformer encoder with norm-first and norm-last]
[Figure: Transformer decoder with norm-first and norm-last]
[Figure: Block diagram for the full Transformer architecture]
[Figure: Schematic object hierarchy for the full Transformer architecture, in object-oriented programming style]

The final points of detail are the residual connections and layer normalization (LayerNorm, or LN), which, while conceptually unnecessary, are necessary for numerical stability and convergence. Similarly to how the feedforward network modules are applied individually to each vector, the LayerNorm is also applied individually to each vector.

There are two common conventions in use: the post-LN and the pre-LN convention. In the post-LN convention, the output of each sublayer is

$$\mathrm{LayerNorm}(x + \mathrm{Sublayer}(x))$$

where $\mathrm{Sublayer}(x)$ is the function implemented by the sublayer itself.

In the pre-LN convention, the output of each sublayer is

$$x + \mathrm{Sublayer}(\mathrm{LayerNorm}(x))$$
The original 2017 Transformer used the post-LN convention. It was difficult to train and required careful hyperparameter tuning and a "warm-up" in learning rate, where the learning rate starts small and is gradually increased. The pre-LN convention, proposed several times in 2018,[59] was found to be easier to train, requiring no warm-up and leading to faster convergence.[46]

=== Pseudocode ===

The following is pseudocode for a standard pre-LN encoder-decoder Transformer, adapted from[60]

input: Encoder input t_e
       Decoder input t_d
output: Array of probability distributions, with shape (decoder vocabulary size x length(decoder output sequence))

/* encoder */
z_e ← encoder.tokenizer(t_e)

for each t in 1:length(z_e) do
    z_e[t] ← encoder.embedding(z_e[t]) + encoder.positional_embedding(t)

for each l in 1:length(encoder.layers) do
    layer ← encoder.layers[l]

    /* first sublayer */
    z_e_copy ← copy(z_e)
    for each t in 1:length(z_e) do
        z_e[t] ← layer.layer_norm(z_e[t])
    z_e ← layer.multiheaded_attention(z_e, z_e, z_e)
    for each t in 1:length(z_e) do
        z_e[t] ← z_e[t] + z_e_copy[t]

    /* second sublayer */
    z_e_copy ← copy(z_e)
    for each t in 1:length(z_e) do
        z_e[t] ← layer.layer_norm(z_e[t])
    z_e ← layer.feedforward(z_e)
    for each t in 1:length(z_e) do
        z_e[t] ← z_e[t] + z_e_copy[t]

for each t in 1:length(z_e) do
    z_e[t] ← encoder.final_layer_norm(z_e[t])

/* decoder */
z_d ← decoder.tokenizer(t_d)

for each t in 1:length(z_d) do
    z_d[t] ← decoder.embedding(z_d[t]) + decoder.positional_embedding(t)

for each l in 1:length(decoder.layers) do
    layer ← decoder.layers[l]

    /* first sublayer */
    z_d_copy ← copy(z_d)
    for each t in 1:length(z_d) do
        z_d[t] ← layer.layer_norm(z_d[t])
    z_d ← layer.masked_multiheaded_attention(z_d, z_d, z_d)
    for each t in 1:length(z_d) do
        z_d[t] ← z_d[t] + z_d_copy[t]

    /* second sublayer */
    z_d_copy ← copy(z_d)
    for each t in 1:length(z_d) do
        z_d[t] ← layer.layer_norm(z_d[t])
    z_d ← layer.multiheaded_attention(z_d, z_e, z_e)
    for each t in 1:length(z_d) do
        z_d[t] ← z_d[t] + z_d_copy[t]

    /* third sublayer */
    z_d_copy ← copy(z_d)
    for each t in 1:length(z_d) do
        z_d[t] ← layer.layer_norm(z_d[t])
    z_d ← layer.feedforward(z_d)
    for each t in 1:length(z_d) do
        z_d[t] ← z_d[t] + z_d_copy[t]

z_d ← decoder.final_layer_norm(z_d)

output_distributions ← []
for each t in 1:length(z_d) do
    output_distributions.append(decoder.unembed(z_d[t]))

return output_distributions

=== Terminology ===
href="/w/index.php?title=Transformer_(deep_learning_architecture)&action=edit&section=27" title="Edit section: Terminology"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>The Transformer architecture, being modular, allows variations. Several common variations are described here.<sup id="cite_ref-:3_63-0" class="reference"><a href="#cite_note-:3-63"><span class="cite-bracket">[</span>61<span class="cite-bracket">]</span></a></sup> </p><p><span class="anchor" id="encoder-only"></span>An "encoder-only" Transformer applies the encoder to map an input text into a sequence of vectors that represent the input text. This is usually used for text embedding and <a href="/wiki/Feature_learning" title="Feature learning">representation learning</a> for downstream applications. <a href="/wiki/BERT_(language_model)" title="BERT (language model)">BERT</a> is encoder-only. They are less often used currently, as they were found to be not significantly better than training an encoder-decoder Transformer, then taking just the encoder.<sup id="cite_ref-:4_53-2" class="reference"><a href="#cite_note-:4-53"><span class="cite-bracket">[</span>51<span class="cite-bracket">]</span></a></sup> </p><p><span class="anchor" id="decoder-only"></span>A "decoder-only" Transformer is not literally decoder-only, since without an encoder, the cross-attention mechanism has nothing to attend to. Thus, the decoder layers in a decoder-only Transformer is composed of just two sublayers: the causally masked self-attention, and the feedforward network. This is usually used for <a href="/wiki/Natural_language_generation" title="Natural language generation">text generation</a> and <a href="/wiki/Large_language_model#Instruction_tuning" title="Large language model">instruction following</a>. The models in the <a href="/wiki/Generative_pre-trained_transformer" title="Generative pre-trained transformer">GPT series</a> and <a href="/wiki/Chinchilla_(language_model)" title="Chinchilla (language model)">Chinchilla series</a> are decoder-only. </p><p><span class="anchor" id="encoder-decoder"></span>An "encoder-decoder" Transformer is generally the same as the original Transformer, with 2 sublayers per encoder layer and 3 sublayers per decoder layer, etc. They might have minor architectural improvements, such as <a href="#Alternative_activation_functions">alternative activation functions</a>, <a href="#pre-LN">changing the location of normalization</a>, etc. This is also usually used for text generation and instruction following. The models in the <a href="/wiki/T5_(language_model)" title="T5 (language model)">T5 series</a> are encoder-decoder.<sup id="cite_ref-:3_63-1" class="reference"><a href="#cite_note-:3-63"><span class="cite-bracket">[</span>61<span class="cite-bracket">]</span></a></sup> </p><p><span class="anchor" id="prefixLM"></span>A "prefixLM" (prefix language model) is a decoder-only architecture, but with prefix masking, which is different from causal masking. 
Specifically, its attention mask has the form[61]: Figure 3

$$M_{\text{prefixLM}} = \begin{bmatrix} \mathbf{0} & -\infty \\ \mathbf{0} & M_{\text{causal}} \end{bmatrix}$$

where the first columns correspond to the "prefix", and the subsequent columns correspond to the autoregressively generated text based on the prefix. PrefixLM models resemble encoder-decoder models, but have less "sparsity". Such models are rarely used, though they are cited as theoretical possibilities and in benchmark comparisons.[51]
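As a rough illustration of the block structure above, the following NumPy sketch builds an additive causal mask and an additive prefixLM mask (0 for "may attend", a large negative number standing in for $-\infty$); the function names and sizes are illustrative only.

```python
import numpy as np

NEG_INF = -1e9  # stand-in for -infinity in an additive attention mask

def causal_mask(n):
    """Additive causal mask: position i may attend only to positions j <= i."""
    m = np.zeros((n, n))
    m[np.triu_indices(n, k=1)] = NEG_INF
    return m

def prefix_lm_mask(prefix_len, total_len):
    """Additive prefixLM mask: the prefix attends bidirectionally to itself,
    while the generated positions are causally masked among themselves."""
    m = np.full((total_len, total_len), NEG_INF)
    m[:, :prefix_len] = 0.0                          # every position sees the whole prefix
    gen = total_len - prefix_len
    m[prefix_len:, prefix_len:] = causal_mask(gen)   # causal block for generated tokens
    return m

print(prefix_lm_mask(prefix_len=2, total_len=5))
```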
There are also mixed seq2seq models. For example, in 2020, Google Translate replaced the previous RNN-encoder–RNN-decoder model with a Transformer-encoder–RNN-decoder model, on the argument that an RNN decoder runs much faster than a Transformer decoder when run autoregressively.[62]

== Subsequent work ==

=== Alternative activation functions ===

The original transformer uses the ReLU activation function. Other activation functions have since been used. The Llama series and PaLM used SwiGLU;[63] both GPT-1 and BERT[35] used GELU.[64]

Alternative activation functions are often used in combination with Gated Linear Units in the feedforward module.[65]
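As a concrete illustration of a gated feedforward sublayer, here is a minimal NumPy sketch of a SwiGLU-style block (swish-gated linear unit followed by a down-projection); the weight names and sizes are illustrative and not taken from any particular model.

```python
import numpy as np

def swish(x):
    """Swish/SiLU activation: x * sigmoid(x)."""
    return x / (1.0 + np.exp(-x))

def swiglu_ffn(x, W_gate, W_up, W_down):
    """Gated feedforward sublayer: (swish(x W_gate) * (x W_up)) W_down.
    x: (seq_len, d_model); W_gate, W_up: (d_model, d_ff); W_down: (d_ff, d_model)."""
    return (swish(x @ W_gate) * (x @ W_up)) @ W_down

# toy usage with random weights
rng = np.random.default_rng(0)
d_model, d_ff, seq_len = 8, 32, 4
x = rng.normal(size=(seq_len, d_model))
out = swiglu_ffn(x,
                 rng.normal(size=(d_model, d_ff)),
                 rng.normal(size=(d_model, d_ff)),
                 rng.normal(size=(d_ff, d_model)))
print(out.shape)  # (4, 8)
```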
=== Alternative normalizations ===

The normalization used in the Transformer can be different from LayerNorm. One example is RMSNorm,[66] which is used in the Llama series. Other examples include CapsuleNorm,[67] ScaleNorm,[68] and FixNorm.[68]
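A minimal NumPy sketch of RMSNorm as usually described (rescale by the root-mean-square of the activations, then multiply by a learned gain); the epsilon value and shapes here are illustrative.

```python
import numpy as np

def rms_norm(x, gain, eps=1e-6):
    """RMSNorm: divide by the root-mean-square of x instead of subtracting the
    mean and dividing by the standard deviation as LayerNorm does.
    x: (..., d_model); gain: (d_model,) learned per-dimension scale."""
    rms = np.sqrt(np.mean(x * x, axis=-1, keepdims=True) + eps)
    return (x / rms) * gain

x = np.array([[1.0, -2.0, 3.0, 0.5]])
print(rms_norm(x, gain=np.ones(4)))
```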
</p> <div class="mw-heading mw-heading4"><h4 id="RoPE">RoPE</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Transformer_(deep_learning_architecture)&action=edit&section=32" title="Edit section: RoPE"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p><span class="anchor" id="Rotary_positional_embedding"></span>RoPE (rotary positional embedding),<sup id="cite_ref-74" class="reference"><a href="#cite_note-74"><span class="cite-bracket">[</span>72<span class="cite-bracket">]</span></a></sup> is best explained by considering a list of 2-dimensional vectors <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle [(x_{1}^{(1)},x_{1}^{(2)}),(x_{2}^{(1)},x_{2}^{(2)}),(x_{3}^{(1)},x_{3}^{(2)}),...]}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mo stretchy="false">[</mo> <mo stretchy="false">(</mo> <msubsup> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mn>1</mn> <mo stretchy="false">)</mo> </mrow> </msubsup> <mo>,</mo> <msubsup> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mn>2</mn> <mo stretchy="false">)</mo> </mrow> </msubsup> <mo stretchy="false">)</mo> <mo>,</mo> <mo stretchy="false">(</mo> <msubsup> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mn>1</mn> <mo stretchy="false">)</mo> </mrow> </msubsup> <mo>,</mo> <msubsup> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mn>2</mn> <mo stretchy="false">)</mo> </mrow> </msubsup> <mo stretchy="false">)</mo> <mo>,</mo> <mo stretchy="false">(</mo> <msubsup> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>3</mn> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mn>1</mn> <mo stretchy="false">)</mo> </mrow> </msubsup> <mo>,</mo> <msubsup> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>3</mn> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mn>2</mn> <mo stretchy="false">)</mo> </mrow> </msubsup> <mo stretchy="false">)</mo> <mo>,</mo> <mo>.</mo> <mo>.</mo> <mo>.</mo> <mo stretchy="false">]</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle [(x_{1}^{(1)},x_{1}^{(2)}),(x_{2}^{(1)},x_{2}^{(2)}),(x_{3}^{(1)},x_{3}^{(2)}),...]}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/08b00c812263b798fed7b345975d49dbebdfada5" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.005ex; width:38.007ex; height:3.676ex;" alt="{\displaystyle [(x_{1}^{(1)},x_{1}^{(2)}),(x_{2}^{(1)},x_{2}^{(2)}),(x_{3}^{(1)},x_{3}^{(2)}),...]}"></span>. 
Now pick some angle $\theta$. Then the RoPE encoding is

$$\text{RoPE}\big(x_m^{(1)}, x_m^{(2)}, m\big) = \begin{pmatrix} \cos m\theta & -\sin m\theta \\ \sin m\theta & \cos m\theta \end{pmatrix} \begin{pmatrix} x_m^{(1)} \\ x_m^{(2)} \end{pmatrix} = \begin{pmatrix} x_m^{(1)} \cos m\theta - x_m^{(2)} \sin m\theta \\ x_m^{(2)} \cos m\theta + x_m^{(1)} \sin m\theta \end{pmatrix}$$

Equivalently, if we write the 2-dimensional vectors as complex numbers $z_m := x_m^{(1)} + i x_m^{(2)}$, then the RoPE encoding is just multiplication by a complex exponential, i.e. a rotation:

$$\text{RoPE}\big(z_m, m\big) = e^{im\theta} z_m$$

For a list of $2n$-dimensional vectors, a RoPE encoder is defined by a sequence of angles $\theta^{(1)}, \dots, \theta^{(n)}$. Then the RoPE encoding is applied to each pair of coordinates.
The benefit of RoPE is that the dot-product between two vectors depends only on their relative location:

$$\text{RoPE}\big(x, m\big)^T \, \text{RoPE}\big(y, n\big) = \text{RoPE}\big(x, m+k\big)^T \, \text{RoPE}\big(y, n+k\big)$$

for any integer $k$.
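A minimal NumPy sketch of RoPE applied to one vector, using the commonly used per-pair angle schedule $\theta^{(j)} = b^{-2j/d}$ with an illustrative base $b$; it also spot-checks the relative-position property stated above.

```python
import numpy as np

def rope(x, m, base=10000.0):
    """Apply rotary positional embedding to a d-dimensional vector x at
    position m, rotating each pair of coordinates by its own angle."""
    d = x.shape[0]
    assert d % 2 == 0
    out = np.empty_like(x)
    for j in range(d // 2):
        theta = base ** (-2.0 * j / d)          # angle for this coordinate pair
        c, s = np.cos(m * theta), np.sin(m * theta)
        x1, x2 = x[2 * j], x[2 * j + 1]
        out[2 * j] = x1 * c - x2 * s
        out[2 * j + 1] = x2 * c + x1 * s
    return out

# the dot product depends only on the relative offset of the two positions
rng = np.random.default_rng(0)
x, y = rng.normal(size=8), rng.normal(size=8)
a = rope(x, 3) @ rope(y, 7)
b = rope(x, 3 + 5) @ rope(y, 7 + 5)
print(np.allclose(a, b))  # True: only the offset (7 - 3) matters
```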
</p> <div class="mw-heading mw-heading4"><h4 id="ALiBi">ALiBi</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Transformer_(deep_learning_architecture)&action=edit&section=33" title="Edit section: ALiBi"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>ALiBi (Attention with Linear Biases)<sup id="cite_ref-75" class="reference"><a href="#cite_note-75"><span class="cite-bracket">[</span>73<span class="cite-bracket">]</span></a></sup> is not a <i>replacement</i> for the positional encoder on the original transformer. Instead, it is an <i>additional</i> positional encoder that is directly plugged into the attention mechanism. Specifically, the ALiBi attention mechanism is<span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\begin{aligned}{\text{Attention}}(Q,K,V)={\text{softmax}}\left({\frac {QK^{\mathrm {T} }}{\sqrt {d_{k}}}}+sB\right)V\end{aligned}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"> <mtr> <mtd> <mrow class="MJX-TeXAtom-ORD"> <mtext>Attention</mtext> </mrow> <mo stretchy="false">(</mo> <mi>Q</mi> <mo>,</mo> <mi>K</mi> <mo>,</mo> <mi>V</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mtext>softmax</mtext> </mrow> <mrow> <mo>(</mo> <mrow> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mrow> <mi>Q</mi> <msup> <mi>K</mi> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">T</mi> </mrow> </mrow> </msup> </mrow> <msqrt> <msub> <mi>d</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>k</mi> </mrow> </msub> </msqrt> </mfrac> </mrow> <mo>+</mo> <mi>s</mi> <mi>B</mi> </mrow> <mo>)</mo> </mrow> <mi>V</mi> </mtd> </mtr> </mtable> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\begin{aligned}{\text{Attention}}(Q,K,V)={\text{softmax}}\left({\frac {QK^{\mathrm {T} }}{\sqrt {d_{k}}}}+sB\right)V\end{aligned}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/bffa099b8703700a9c48178b2158edb858fc58b9" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -3.171ex; width:49.448ex; height:7.509ex;" alt="{\displaystyle {\begin{aligned}{\text{Attention}}(Q,K,V)={\text{softmax}}\left({\frac {QK^{\mathrm {T} }}{\sqrt {d_{k}}}}+sB\right)V\end{aligned}}}"></span>Here, <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle s}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>s</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle s}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/01d131dfd7673938b947072a13a9744fe997e632" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.09ex; height:1.676ex;" alt="{\displaystyle s}"></span> is a real number ("scalar"), and <span class="mwe-math-element"><span 
class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle B}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>B</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle B}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/47136aad860d145f75f3eed3022df827cee94d7a" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.764ex; height:2.176ex;" alt="{\displaystyle B}"></span> is the <i>linear bias</i> matrix defined by<span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle B={\begin{pmatrix}0&1&2&3&\cdots \\-1&0&1&2&\cdots \\-2&-1&0&1&\cdots \\-3&-2&-1&0&\cdots \\\vdots &\vdots &\vdots &\vdots &\ddots \\\end{pmatrix}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>B</mi> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow> <mo>(</mo> <mtable rowspacing="4pt" columnspacing="1em"> <mtr> <mtd> <mn>0</mn> </mtd> <mtd> <mn>1</mn> </mtd> <mtd> <mn>2</mn> </mtd> <mtd> <mn>3</mn> </mtd> <mtd> <mo>⋯<!-- ⋯ --></mo> </mtd> </mtr> <mtr> <mtd> <mo>−<!-- − --></mo> <mn>1</mn> </mtd> <mtd> <mn>0</mn> </mtd> <mtd> <mn>1</mn> </mtd> <mtd> <mn>2</mn> </mtd> <mtd> <mo>⋯<!-- ⋯ --></mo> </mtd> </mtr> <mtr> <mtd> <mo>−<!-- − --></mo> <mn>2</mn> </mtd> <mtd> <mo>−<!-- − --></mo> <mn>1</mn> </mtd> <mtd> <mn>0</mn> </mtd> <mtd> <mn>1</mn> </mtd> <mtd> <mo>⋯<!-- ⋯ --></mo> </mtd> </mtr> <mtr> <mtd> <mo>−<!-- − --></mo> <mn>3</mn> </mtd> <mtd> <mo>−<!-- − --></mo> <mn>2</mn> </mtd> <mtd> <mo>−<!-- − --></mo> <mn>1</mn> </mtd> <mtd> <mn>0</mn> </mtd> <mtd> <mo>⋯<!-- ⋯ --></mo> </mtd> </mtr> <mtr> <mtd> <mo>⋮<!-- ⋮ --></mo> </mtd> <mtd> <mo>⋮<!-- ⋮ --></mo> </mtd> <mtd> <mo>⋮<!-- ⋮ --></mo> </mtd> <mtd> <mo>⋮<!-- ⋮ --></mo> </mtd> <mtd> <mo>⋱<!-- ⋱ --></mo> </mtd> </mtr> </mtable> <mo>)</mo> </mrow> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle B={\begin{pmatrix}0&1&2&3&\cdots \\-1&0&1&2&\cdots \\-2&-1&0&1&\cdots \\-3&-2&-1&0&\cdots \\\vdots &\vdots &\vdots &\vdots &\ddots \\\end{pmatrix}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/12fa6edca20501fef7d70c74860db1e2fc70068a" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -8.171ex; width:32.024ex; height:17.509ex;" alt="{\displaystyle B={\begin{pmatrix}0&1&2&3&\cdots \\-1&0&1&2&\cdots \\-2&-1&0&1&\cdots \\-3&-2&-1&0&\cdots \\\vdots &\vdots &\vdots &\vdots &\ddots \\\end{pmatrix}}}"></span>in other words, <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle B_{i,j}=j-i}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>B</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> <mo>,</mo> <mi>j</mi> </mrow> </msub> <mo>=</mo> <mi>j</mi> <mo>−<!-- − --></mo> <mi>i</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle B_{i,j}=j-i}</annotation> </semantics> </math></span><img 
src="https://wikimedia.org/api/rest_v1/media/math/render/svg/1be4dbbb894617d7ae4dd3118e0be60cd018aec7" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.005ex; width:11.398ex; height:2.843ex;" alt="{\displaystyle B_{i,j}=j-i}"></span>. The idea being that the linear bias matrix is a softened mask. Just as <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle 0}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mn>0</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle 0}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/2aae8864a3c1fec9585261791a809ddec1489950" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.162ex; height:2.176ex;" alt="{\displaystyle 0}"></span> represent full attention paid, and <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle -\infty }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mo>−<!-- − --></mo> <mi mathvariant="normal">∞<!-- ∞ --></mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle -\infty }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/ca2608c4b5fd3bffc73585f8c67e379b4e99b6f1" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.505ex; width:4.132ex; height:2.176ex;" alt="{\displaystyle -\infty }"></span> represents no attention paid, the linear bias matrix increases attention paid in one direction and decreases attention paid in the other direction. </p><p>ALiBi allows pretraining on short context windows, then fine-tuning on longer context windows. Since it is directly plugged into the attention mechanism, it can be combined with any positional encoder that is plugged into the "bottom" of the entire network (which is where the sinusoidal encoder on the original transformer, as well as RoPE and many others, are located). 
</p> <div class="mw-heading mw-heading4"><h4 id="Relative_Position_Encodings">Relative Position Encodings</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Transformer_(deep_learning_architecture)&action=edit&section=34" title="Edit section: Relative Position Encodings"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Relative Position Encodings<sup id="cite_ref-76" class="reference"><a href="#cite_note-76"><span class="cite-bracket">[</span>74<span class="cite-bracket">]</span></a></sup> is similar to ALiBi, but more generic:<span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\begin{aligned}{\text{Attention}}(Q,K,V)={\text{softmax}}\left({\frac {QK^{\mathrm {T} }}{\sqrt {d_{k}}}}+B\right)V\end{aligned}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"> <mtr> <mtd> <mrow class="MJX-TeXAtom-ORD"> <mtext>Attention</mtext> </mrow> <mo stretchy="false">(</mo> <mi>Q</mi> <mo>,</mo> <mi>K</mi> <mo>,</mo> <mi>V</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mtext>softmax</mtext> </mrow> <mrow> <mo>(</mo> <mrow> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mrow> <mi>Q</mi> <msup> <mi>K</mi> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">T</mi> </mrow> </mrow> </msup> </mrow> <msqrt> <msub> <mi>d</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>k</mi> </mrow> </msub> </msqrt> </mfrac> </mrow> <mo>+</mo> <mi>B</mi> </mrow> <mo>)</mo> </mrow> <mi>V</mi> </mtd> </mtr> </mtable> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\begin{aligned}{\text{Attention}}(Q,K,V)={\text{softmax}}\left({\frac {QK^{\mathrm {T} }}{\sqrt {d_{k}}}}+B\right)V\end{aligned}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f15684ca2a0019fc697c04d62cea397b8f47dbb2" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -3.171ex; width:48.358ex; height:7.509ex;" alt="{\displaystyle {\begin{aligned}{\text{Attention}}(Q,K,V)={\text{softmax}}\left({\frac {QK^{\mathrm {T} }}{\sqrt {d_{k}}}}+B\right)V\end{aligned}}}"></span>where <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle B}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>B</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle B}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/47136aad860d145f75f3eed3022df827cee94d7a" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.764ex; height:2.176ex;" alt="{\displaystyle B}"></span> is a <a href="/wiki/Toeplitz_matrix" title="Toeplitz matrix">Toeplitz matrix</a>, that is, <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math 
xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle B_{i,j}=B_{i',j'}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>B</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> <mo>,</mo> <mi>j</mi> </mrow> </msub> <mo>=</mo> <msub> <mi>B</mi> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi>i</mi> <mo>′</mo> </msup> <mo>,</mo> <msup> <mi>j</mi> <mo>′</mo> </msup> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle B_{i,j}=B_{i',j'}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/fe7155b71c5f6c6f726c49a3c7f3a9047414171d" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.171ex; width:11.559ex; height:3.009ex;" alt="{\displaystyle B_{i,j}=B_{i',j'}}"></span> whenever <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle i-j=i'-j'}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>i</mi> <mo>−<!-- − --></mo> <mi>j</mi> <mo>=</mo> <msup> <mi>i</mi> <mo>′</mo> </msup> <mo>−<!-- − --></mo> <msup> <mi>j</mi> <mo>′</mo> </msup> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle i-j=i'-j'}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/a421740142156ea5322442d9d58cf47412bd30bf" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:13.67ex; height:2.843ex;" alt="{\displaystyle i-j=i'-j'}"></span>. This is contrasted with the original sinusoidal positional encoding, which is an "absolute positional encoding".<sup id="cite_ref-77" class="reference"><a href="#cite_note-77"><span class="cite-bracket">[</span>75<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Efficient_implementation">Efficient implementation</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Transformer_(deep_learning_architecture)&action=edit&section=35" title="Edit section: Efficient implementation"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>The transformer model has been implemented in standard deep learning <a href="/wiki/Framework_(computer_science)" class="mw-redirect" title="Framework (computer science)">frameworks</a> such as <a href="/wiki/TensorFlow" title="TensorFlow">TensorFlow</a> and <a href="/wiki/PyTorch" title="PyTorch">PyTorch</a>. 
=== Efficient implementation ===

The transformer model has been implemented in standard deep learning frameworks such as TensorFlow and PyTorch. The Transformers library, produced by Hugging Face, supplies transformer-based architectures and pretrained models.[11]

==== KV caching ====

When an autoregressive transformer is used for inference, such as generating text, the query vector is different at each step, but the already-computed key and value vectors are always the same. The KV caching method saves the computed key and value vectors at each attention block, so that they are not recomputed at each new token. PagedAttention applies memory paging to KV caching.[76][77][78]

If a transformer is used with a baked-in prompt, such as ["You are a customer support agent..."], then the key and value vectors can be computed for the prompt and saved on disk. The saving in compute is significant when the model is used for many short interactions, such as in online chatbots.
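A minimal sketch of the KV caching idea for a single attention head, assuming hypothetical projection matrices and a plain dictionary as the cache: at each decoding step only the newest token's key and value are computed, while all previous ones are reused.

```python
import numpy as np

def softmax(z):
    z = z - z.max()
    e = np.exp(z)
    return e / e.sum()

def attend_with_cache(x_new, W_q, W_k, W_v, cache):
    """One decoding step for one attention head.
    x_new: (d_model,) hidden vector of the newest token.
    cache: dict holding the growing lists of past keys and values."""
    q = x_new @ W_q
    cache["keys"].append(x_new @ W_k)     # only the new token's K and V are computed
    cache["values"].append(x_new @ W_v)
    K = np.stack(cache["keys"])           # (t, d_k), reused from previous steps
    V = np.stack(cache["values"])
    weights = softmax(q @ K.T / np.sqrt(K.shape[-1]))
    return weights @ V

# toy usage: decode three tokens, reusing the cache
rng = np.random.default_rng(0)
d = 8
W_q, W_k, W_v = (rng.normal(size=(d, d)) for _ in range(3))
cache = {"keys": [], "values": []}
for step in range(3):
    out = attend_with_cache(rng.normal(size=d), W_q, W_k, W_v, cache)
print(len(cache["keys"]))  # 3 cached key vectors
```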
==== FlashAttention ====

FlashAttention[79] is an algorithm that implements the transformer attention mechanism efficiently on a GPU. It is a communication-avoiding algorithm that performs matrix multiplications in blocks, such that each block fits within the cache of a GPU, and by careful management of the blocks it minimizes data copying between GPU caches (as data movement is slow). See the page on softmax for details.

An improved version, FlashAttention-2,[80][81][82] was developed to cater to the rising demand for language models capable of handling longer context lengths. It offers enhancements in work partitioning and parallelism, enabling it to achieve up to 230 TFLOPs/s on A100 GPUs (FP16/BF16), a 2x speed increase over the original FlashAttention.

Key advancements in FlashAttention-2 include the reduction of non-matmul FLOPs, improved parallelism over the sequence-length dimension, better work partitioning between GPU warps, and added support for head dimensions up to 256, multi-query attention (MQA), and grouped-query attention (GQA).[83]

Benchmarks revealed FlashAttention-2 to be up to 2x faster than FlashAttention and up to 9x faster than a standard attention implementation in PyTorch. Future developments include optimization for new hardware like H100 GPUs and new data types like FP8.
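The NumPy sketch below illustrates only the numerical trick that block-wise attention relies on: processing key/value blocks with a running maximum and running normalizer so the full score matrix never has to be materialized. It is a didactic sketch of online (block-wise) softmax, not the actual fused GPU kernel.

```python
import numpy as np

def blocked_attention(q, K, V, block=4):
    """Attention output for one query q against keys K and values V,
    processed block by block with a running max and running normalizer."""
    d_k = q.shape[-1]
    m = -np.inf                   # running maximum of scores seen so far
    denom = 0.0                   # running softmax normalizer
    acc = np.zeros_like(V[0])     # running unnormalized weighted sum of values
    for start in range(0, K.shape[0], block):
        Kb, Vb = K[start:start + block], V[start:start + block]
        s = Kb @ q / np.sqrt(d_k)            # scores for this block
        m_new = max(m, s.max())
        scale = np.exp(m - m_new)            # rescale previous partial results
        p = np.exp(s - m_new)
        denom = denom * scale + p.sum()
        acc = acc * scale + p @ Vb
        m = m_new
    return acc / denom

# agrees with the straightforward implementation
rng = np.random.default_rng(0)
q, K, V = rng.normal(size=8), rng.normal(size=(16, 8)), rng.normal(size=(16, 8))
s = K @ q / np.sqrt(8)
w = np.exp(s - s.max()); w = w / w.sum()
print(np.allclose(blocked_attention(q, K, V), w @ V))  # True
```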
class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\text{MultiheadedAttention}}(Q,K,V)={\text{Concat}}_{i\in [n_{\text{heads}}]}\left({\text{Attention}}(XW_{i}^{Q},XW_{i}^{K},XW_{i}^{V})\right)W^{O}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mtext>MultiheadedAttention</mtext> </mrow> <mo stretchy="false">(</mo> <mi>Q</mi> <mo>,</mo> <mi>K</mi> <mo>,</mo> <mi>V</mi> <mo stretchy="false">)</mo> <mo>=</mo> <msub> <mrow class="MJX-TeXAtom-ORD"> <mtext>Concat</mtext> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> <mo>∈<!-- ∈ --></mo> <mo stretchy="false">[</mo> <msub> <mi>n</mi> <mrow class="MJX-TeXAtom-ORD"> <mtext>heads</mtext> </mrow> </msub> <mo stretchy="false">]</mo> </mrow> </msub> <mrow> <mo>(</mo> <mrow> <mrow class="MJX-TeXAtom-ORD"> <mtext>Attention</mtext> </mrow> <mo stretchy="false">(</mo> <mi>X</mi> <msubsup> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>Q</mi> </mrow> </msubsup> <mo>,</mo> <mi>X</mi> <msubsup> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>K</mi> </mrow> </msubsup> <mo>,</mo> <mi>X</mi> <msubsup> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>V</mi> </mrow> </msubsup> <mo stretchy="false">)</mo> </mrow> <mo>)</mo> </mrow> <msup> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>O</mi> </mrow> </msup> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\text{MultiheadedAttention}}(Q,K,V)={\text{Concat}}_{i\in [n_{\text{heads}}]}\left({\text{Attention}}(XW_{i}^{Q},XW_{i}^{K},XW_{i}^{V})\right)W^{O}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/8c4da1f52f4912855ddbc0fb74d3a2c94831e27e" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.838ex; width:89.231ex; height:4.843ex;" alt="{\displaystyle {\text{MultiheadedAttention}}(Q,K,V)={\text{Concat}}_{i\in [n_{\text{heads}}]}\left({\text{Attention}}(XW_{i}^{Q},XW_{i}^{K},XW_{i}^{V})\right)W^{O}}"></span>with Multi-Query Attention, there is just one <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle W^{K},W^{V}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msup> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>K</mi> </mrow> </msup> <mo>,</mo> <msup> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>V</mi> </mrow> </msup> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle W^{K},W^{V}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/d4e0456602ee9f229ab1286d5f486eb5f71332ae" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:9.239ex; height:3.009ex;" alt="{\displaystyle W^{K},W^{V}}"></span>, thus: </p><p><span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\text{MultiQueryAttention}}(Q,K,V)={\text{Concat}}_{i\in 
[n_{\text{heads}}]}\left({\text{Attention}}(XW_{i}^{Q},XW^{K},XW^{V})\right)W^{O}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mtext>MultiQueryAttention</mtext> </mrow> <mo stretchy="false">(</mo> <mi>Q</mi> <mo>,</mo> <mi>K</mi> <mo>,</mo> <mi>V</mi> <mo stretchy="false">)</mo> <mo>=</mo> <msub> <mrow class="MJX-TeXAtom-ORD"> <mtext>Concat</mtext> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> <mo>∈<!-- ∈ --></mo> <mo stretchy="false">[</mo> <msub> <mi>n</mi> <mrow class="MJX-TeXAtom-ORD"> <mtext>heads</mtext> </mrow> </msub> <mo stretchy="false">]</mo> </mrow> </msub> <mrow> <mo>(</mo> <mrow> <mrow class="MJX-TeXAtom-ORD"> <mtext>Attention</mtext> </mrow> <mo stretchy="false">(</mo> <mi>X</mi> <msubsup> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>Q</mi> </mrow> </msubsup> <mo>,</mo> <mi>X</mi> <msup> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>K</mi> </mrow> </msup> <mo>,</mo> <mi>X</mi> <msup> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>V</mi> </mrow> </msup> <mo stretchy="false">)</mo> </mrow> <mo>)</mo> </mrow> <msup> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>O</mi> </mrow> </msup> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\text{MultiQueryAttention}}(Q,K,V)={\text{Concat}}_{i\in [n_{\text{heads}}]}\left({\text{Attention}}(XW_{i}^{Q},XW^{K},XW^{V})\right)W^{O}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/2eb0939568b3364f0c300eca805463355ce6d554" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.838ex; width:88.398ex; height:4.843ex;" alt="{\displaystyle {\text{MultiQueryAttention}}(Q,K,V)={\text{Concat}}_{i\in [n_{\text{heads}}]}\left({\text{Attention}}(XW_{i}^{Q},XW^{K},XW^{V})\right)W^{O}}"></span> </p><p>This has a neutral effect on model quality and training speed, but increases inference speed. </p><p>More generally, grouped-query attention (GQA) partitions attention heads into groups, each of which shares the key-value pair. 
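A schematic NumPy sketch of grouped-query attention: query heads are partitioned into groups and each group shares one key/value projection, so fewer keys and values need to be computed and cached. Head counts and shapes are illustrative, and the output projection $W^O$ is omitted.

```python
import numpy as np

def softmax(z, axis=-1):
    z = z - z.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

def grouped_query_attention(X, W_q, W_k, W_v, n_groups):
    """X: (seq, d_model). W_q: (n_q_heads, d_model, d_head).
    W_k, W_v: (n_groups, d_model, d_head), one K/V projection per group.
    n_q_heads must be divisible by n_groups; MQA is the case n_groups == 1."""
    n_q_heads = W_q.shape[0]
    heads_per_group = n_q_heads // n_groups
    outputs = []
    for h in range(n_q_heads):
        g = h // heads_per_group               # which K/V group this query head uses
        Q, K, V = X @ W_q[h], X @ W_k[g], X @ W_v[g]
        A = softmax(Q @ K.T / np.sqrt(Q.shape[-1]))
        outputs.append(A @ V)
    return np.concatenate(outputs, axis=-1)    # concatenated heads

rng = np.random.default_rng(0)
seq, d_model, d_head, n_q_heads, n_groups = 5, 16, 4, 8, 2
out = grouped_query_attention(
    rng.normal(size=(seq, d_model)),
    rng.normal(size=(n_q_heads, d_model, d_head)),
    rng.normal(size=(n_groups, d_model, d_head)),
    rng.normal(size=(n_groups, d_model, d_head)),
    n_groups,
)
print(out.shape)  # (5, 32)
```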
[Figure: The architecture of DeepSeek-V2, showing both MLA and a variant of mixture of experts.[86]: Figure 2]

Multihead Latent Attention (MLA) is a low-rank approximation to standard MHA. Specifically, each hidden vector, before entering the attention mechanism, is first projected into two low-dimensional ("latent") spaces, one for the query and one for the key-value (KV) vector. This design minimizes the KV cache, as only the low-dimensional KV vector needs to be cached.[86]
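A rough NumPy sketch of the low-rank idea in MLA: the hidden vector is compressed into a small latent KV vector, which is what gets cached, and keys and values are reconstructed from it by up-projections. The dimensions and names are illustrative, and details of the actual DeepSeek design are omitted.

```python
import numpy as np

d_model, d_latent, d_head = 32, 8, 16   # illustrative sizes; d_latent << d_model

rng = np.random.default_rng(0)
W_down_kv = rng.normal(size=(d_model, d_latent))  # compress hidden state to latent KV
W_up_k = rng.normal(size=(d_latent, d_head))      # reconstruct keys from the latent
W_up_v = rng.normal(size=(d_latent, d_head))      # reconstruct values from the latent

def compress_kv(x):
    """Only this small latent vector needs to be kept in the KV cache."""
    return x @ W_down_kv                 # (d_latent,)

def expand_kv(c_kv):
    """Recover a per-head key and value from the cached latent vector."""
    return c_kv @ W_up_k, c_kv @ W_up_v  # each (d_head,)

x = rng.normal(size=d_model)
latent = compress_kv(x)                  # cached: 8 numbers instead of 2 x 16
k, v = expand_kv(latent)
print(latent.shape, k.shape, v.shape)
```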
==== Speculative decoding ====

Speculative decoding[87][88] is a method to accelerate token decoding. Similarly to speculative execution in CPUs, future tokens are computed quickly, then verified. If the quickly computed tokens are incorrect, they are discarded and recomputed slowly.

The key factor in speculative decoding is that a Transformer decoder can verify faster than it can decode, in the following sense.

Suppose we have two transformer models, such as GPT-3 and GPT-3-small, both with a context window size of 512. To generate an entire context window autoregressively with greedy decoding with GPT-3, it must be run 512 times, each time generating a token $x_1, x_2, \dots, x_{512}$, taking time $512 T_{\text{GPT-3}}$.
However, if we had some educated guess for the values of these tokens, we could verify all of them in parallel, in one run of the model, by checking that each <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle x_{t}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle x_{t}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f279a30bc8eabc788f3fe81c9cfb674e72e858db" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:2.156ex; height:2.009ex;" alt="{\displaystyle x_{t}}"></span> is indeed the token with the largest log-likelihood in the <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle t}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>t</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle t}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/65658b7b223af9e1acc877d848888ecdb4466560" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:0.84ex; height:2.009ex;" alt="{\displaystyle t}"></span>-th output. </p><p>In speculative decoding, a smaller model or some other simple heuristic is used to generate a few speculative tokens that are subsequently verified by the larger model. 
For example, suppose we use GPT-3-small to generate four speculative tokens: <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\tilde {x}}_{1},{\tilde {x}}_{2},{\tilde {x}}_{3},{\tilde {x}}_{4}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mover> <mi>x</mi> <mo stretchy="false">~<!-- ~ --></mo> </mover> </mrow> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo>,</mo> <msub> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mover> <mi>x</mi> <mo stretchy="false">~<!-- ~ --></mo> </mover> </mrow> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo>,</mo> <msub> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mover> <mi>x</mi> <mo stretchy="false">~<!-- ~ --></mo> </mover> </mrow> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mn>3</mn> </mrow> </msub> <mo>,</mo> <msub> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mover> <mi>x</mi> <mo stretchy="false">~<!-- ~ --></mo> </mover> </mrow> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mn>4</mn> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\tilde {x}}_{1},{\tilde {x}}_{2},{\tilde {x}}_{3},{\tilde {x}}_{4}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/d8db060dad8f25abc632d0c847694d10943fefc0" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:12.638ex; height:2.509ex;" alt="{\displaystyle {\tilde {x}}_{1},{\tilde {x}}_{2},{\tilde {x}}_{3},{\tilde {x}}_{4}}"></span>. This only takes <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle 4T_{\text{GPT-3-small}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mn>4</mn> <msub> <mi>T</mi> <mrow class="MJX-TeXAtom-ORD"> <mtext>GPT-3-small</mtext> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle 4T_{\text{GPT-3-small}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f27766b2e3974a6b36c1a34989690d453598eedd" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:12.019ex; height:2.509ex;" alt="{\displaystyle 4T_{\text{GPT-3-small}}}"></span>. These tokens are then run through the larger GPT-3 in one go. 
Suppose that <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\tilde {x}}_{1}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mover> <mi>x</mi> <mo stretchy="false">~<!-- ~ --></mo> </mover> </mrow> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\tilde {x}}_{1}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/bd6433a2811e7287025f60332718fe31d72003a7" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:2.384ex; height:2.509ex;" alt="{\displaystyle {\tilde {x}}_{1}}"></span> and <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\tilde {x}}_{2}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mover> <mi>x</mi> <mo stretchy="false">~<!-- ~ --></mo> </mover> </mrow> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\tilde {x}}_{2}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/e6c8b294488a1ddcc0380c8a8cb75b3410fe7ede" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:2.384ex; height:2.509ex;" alt="{\displaystyle {\tilde {x}}_{2}}"></span> are verified by GPT-3 as what it would have picked, then those are kept, but <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\tilde {x}}_{3}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mover> <mi>x</mi> <mo stretchy="false">~<!-- ~ --></mo> </mover> </mrow> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mn>3</mn> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\tilde {x}}_{3}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/2de3175aa437329b2c5b3a3fc1167be15882b81d" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:2.384ex; height:2.509ex;" alt="{\displaystyle {\tilde {x}}_{3}}"></span> is not, so <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\tilde {x}}_{3},{\tilde {x}}_{4}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mover> <mi>x</mi> <mo stretchy="false">~<!-- ~ --></mo> </mover> </mrow> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mn>3</mn> </mrow> </msub> <mo>,</mo> <msub> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mover> <mi>x</mi> <mo stretchy="false">~<!-- ~ 
--></mo> </mover> </mrow> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mn>4</mn> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\tilde {x}}_{3},{\tilde {x}}_{4}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/fa75c04dd4b830a3f8686e81cc88136365640db8" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:5.802ex; height:2.509ex;" alt="{\displaystyle {\tilde {x}}_{3},{\tilde {x}}_{4}}"></span> are discarded, and GPT-3 is run on those. This would take <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle 4T_{\text{GPT-3-small}}+3T_{\text{GPT-3}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mn>4</mn> <msub> <mi>T</mi> <mrow class="MJX-TeXAtom-ORD"> <mtext>GPT-3-small</mtext> </mrow> </msub> <mo>+</mo> <mn>3</mn> <msub> <mi>T</mi> <mrow class="MJX-TeXAtom-ORD"> <mtext>GPT-3</mtext> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle 4T_{\text{GPT-3-small}}+3T_{\text{GPT-3}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/4c60c2a824e236297f444a500e8c324540dfd24d" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:22.577ex; height:2.509ex;" alt="{\displaystyle 4T_{\text{GPT-3-small}}+3T_{\text{GPT-3}}}"></span>, which might be shorter than <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle 4T_{\text{GPT-3}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mn>4</mn> <msub> <mi>T</mi> <mrow class="MJX-TeXAtom-ORD"> <mtext>GPT-3</mtext> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle 4T_{\text{GPT-3}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/e2bf07e1b17f4de6fb7480487563b3a31613dfd7" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:7.718ex; height:2.509ex;" alt="{\displaystyle 4T_{\text{GPT-3}}}"></span>. 
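<p>The greedy draft-then-verify loop described above can be written as a short sketch. This is illustrative only: <code>draft_model</code> and <code>target_model</code> are hypothetical callables (not from any particular library) that map a token sequence to an array of next-token logits, one row per position, where row <i>j</i> scores the token at position <i>j</i>&#160;+&#160;1.</p>
<pre>
import numpy as np

def speculative_greedy_step(prefix, draft_model, target_model, k=4):
    """One draft-then-verify step of greedy speculative decoding."""
    # 1. Draft: the small model proposes k tokens autoregressively (cheap).
    drafted, ctx = [], list(prefix)
    for _ in range(k):
        tok = int(np.argmax(draft_model(ctx)[-1]))   # greedy next token from the draft model
        drafted.append(tok)
        ctx.append(tok)

    # 2. Verify: one forward pass of the large model over prefix + drafted
    #    yields, for every drafted position, the token it would have picked.
    logits = target_model(list(prefix) + drafted)
    out = list(prefix)
    for i, tok in enumerate(drafted):
        greedy = int(np.argmax(logits[len(prefix) + i - 1]))
        if greedy == tok:
            out.append(tok)      # accepted: the draft matches the large model's greedy choice
        else:
            out.append(greedy)   # rejected: keep the large model's own token, discard the rest
            break
    return out
</pre>
<p>When all drafted tokens are accepted, a single verification pass of the large model advances the sequence by several tokens at once, which is where the saving in the example above comes from. In this sketch the verification pass also supplies the large model's own token at the first rejected position, which is one common bookkeeping choice.</p>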
</p><p>For non-greedy decoding, similar ideas apply, except the speculative tokens are accepted or rejected stochastically, in a way that guarantees the final output distribution is the same as if speculative decoding was not used.<sup id="cite_ref-:2_89-1" class="reference"><a href="#cite_note-:2-89"><span class="cite-bracket">[</span>87<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-91" class="reference"><a href="#cite_note-91"><span class="cite-bracket">[</span>89<span class="cite-bracket">]</span></a></sup> </p> <figure class="mw-default-size" typeof="mw:File/Thumb"><a href="/wiki/File:Multi-Token_Prediction_(DeepSeek)_01.svg" class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Multi-Token_Prediction_%28DeepSeek%29_01.svg/220px-Multi-Token_Prediction_%28DeepSeek%29_01.svg.png" decoding="async" width="220" height="98" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Multi-Token_Prediction_%28DeepSeek%29_01.svg/330px-Multi-Token_Prediction_%28DeepSeek%29_01.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Multi-Token_Prediction_%28DeepSeek%29_01.svg/440px-Multi-Token_Prediction_%28DeepSeek%29_01.svg.png 2x" data-file-width="1560" data-file-height="695" /></a><figcaption>Multi-Token Prediction.</figcaption></figure> <p><span class="anchor" id="Multi-Token_Prediction"></span>In Multi-Token Prediction, a single forward pass creates a final embedding vector, which then is un-embedded into a token probability. However, that vector can then be further processed by another Transformer block to predict the <i>next</i> token, and so on for arbitrarily many steps into the future. This trades off accuracy for speed, since each new token costs just one more Transformer block, rather than the entire stack.<sup id="cite_ref-92" class="reference"><a href="#cite_note-92"><span class="cite-bracket">[</span>90<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-93" class="reference"><a href="#cite_note-93"><span class="cite-bracket">[</span>91<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Sub-quadratic_transformers">Sub-quadratic transformers</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Transformer_(deep_learning_architecture)&action=edit&section=40" title="Edit section: Sub-quadratic transformers"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Training transformer-based architectures can be expensive, especially for long inputs.<sup id="cite_ref-reformer_94-0" class="reference"><a href="#cite_note-reformer-94"><span class="cite-bracket">[</span>92<span class="cite-bracket">]</span></a></sup> Many methods have been developed to attempt to address the issue. 
In the image domain, Swin Transformer is an efficient architecture that performs attention inside shifted windows.<sup id="cite_ref-95" class="reference"><a href="#cite_note-95"><span class="cite-bracket">[</span>93<span class="cite-bracket">]</span></a></sup> In the audio domain, SepTr decouples the attention in the time and frequency domains.<sup id="cite_ref-96" class="reference"><a href="#cite_note-96"><span class="cite-bracket">[</span>94<span class="cite-bracket">]</span></a></sup> <i>Long Range Arena</i> (2020)<sup id="cite_ref-97" class="reference"><a href="#cite_note-97"><span class="cite-bracket">[</span>95<span class="cite-bracket">]</span></a></sup> is a standard benchmark for comparing the behavior of transformer architectures over long inputs. </p> <div class="mw-heading mw-heading4"><h4 id="Alternative_attention_graphs">Alternative attention graphs</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Transformer_(deep_learning_architecture)&action=edit&section=41" title="Edit section: Alternative attention graphs"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>The standard attention graph is either all-to-all or causal, both of which scale as <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle O(N^{2})}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>O</mi> <mo stretchy="false">(</mo> <msup> <mi>N</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle O(N^{2})}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/e5d43a3df904fa4d7220f5b86285298aa36d969b" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:6.76ex; height:3.176ex;" alt="{\displaystyle O(N^{2})}"></span> where <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle N}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>N</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle N}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f5e3890c981ae85503089652feb48b191b57aae3" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:2.064ex; height:2.176ex;" alt="{\displaystyle N}"></span> is the number of tokens in a sequence. 
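<p>To make the quadratic scaling concrete, the number of query-key pairs (edges in the attention graph) can be counted directly; the fixed-width local window is included only for contrast with the sub-quadratic methods discussed below. This is a didactic sketch, not code from any cited implementation.</p>
<pre>
def attention_edge_counts(n, window=32):
    full = n * n                                        # all-to-all: every token attends to every token
    causal = n * (n + 1) // 2                           # causal: token i attends to tokens 0..i
    local = sum(min(i + 1, window) for i in range(n))   # causal attention restricted to a local window
    return full, causal, local

for n in (256, 512, 1024):
    # full and causal grow as n**2, while the windowed graph grows as roughly n * window
    print(n, attention_edge_counts(n))
</pre>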
</p><p>Reformer (2020)<sup id="cite_ref-reformer_94-1" class="reference"><a href="#cite_note-reformer-94"><span class="cite-bracket">[</span>92<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-98" class="reference"><a href="#cite_note-98"><span class="cite-bracket">[</span>96<span class="cite-bracket">]</span></a></sup> reduces the computational load from <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle O(N^{2})}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>O</mi> <mo stretchy="false">(</mo> <msup> <mi>N</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle O(N^{2})}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/e5d43a3df904fa4d7220f5b86285298aa36d969b" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:6.76ex; height:3.176ex;" alt="{\displaystyle O(N^{2})}"></span> to <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle O(N\ln N)}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>O</mi> <mo stretchy="false">(</mo> <mi>N</mi> <mi>ln</mi> <mo>⁡<!-- --></mo> <mi>N</mi> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle O(N\ln N)}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/d3d19d1f2923ba0d7170ade3df165c0de1d2423e" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:10.423ex; height:2.843ex;" alt="{\displaystyle O(N\ln N)}"></span> by using <a href="/wiki/Locality-sensitive_hashing" title="Locality-sensitive hashing">locality-sensitive hashing</a> and reversible layers.<sup id="cite_ref-99" class="reference"><a href="#cite_note-99"><span class="cite-bracket">[</span>97<span class="cite-bracket">]</span></a></sup> </p><p>Sparse attention<sup id="cite_ref-100" class="reference"><a href="#cite_note-100"><span class="cite-bracket">[</span>98<span class="cite-bracket">]</span></a></sup> uses attention graphs that grow more slowly than <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle O(N^{2})}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>O</mi> <mo stretchy="false">(</mo> <msup> <mi>N</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle O(N^{2})}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/e5d43a3df904fa4d7220f5b86285298aa36d969b" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:6.76ex; height:3.176ex;" alt="{\displaystyle O(N^{2})}"></span>. 
For example, BigBird (2020)<sup id="cite_ref-101" class="reference"><a href="#cite_note-101"><span class="cite-bracket">[</span>99<span class="cite-bracket">]</span></a></sup> uses random <a href="/wiki/Small-world_network" title="Small-world network">small-world networks</a> which grows as <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle O(N)}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>O</mi> <mo stretchy="false">(</mo> <mi>N</mi> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle O(N)}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/78484c5c26cfc97bb3b915418caa09454421e80b" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:5.646ex; height:2.843ex;" alt="{\displaystyle O(N)}"></span>. </p><p>Ordinary transformers require a memory size that is quadratic in the size of the context window. Attention-free transformers<sup id="cite_ref-102" class="reference"><a href="#cite_note-102"><span class="cite-bracket">[</span>100<span class="cite-bracket">]</span></a></sup> reduce this to a linear dependence while still retaining the advantages of a transformer by linking the key to the value. </p> <div class="mw-heading mw-heading4"><h4 id="Random_Feature_Attention">Random Feature Attention</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Transformer_(deep_learning_architecture)&action=edit&section=42" title="Edit section: Random Feature Attention"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Random Feature Attention (2021)<sup id="cite_ref-103" class="reference"><a href="#cite_note-103"><span class="cite-bracket">[</span>101<span class="cite-bracket">]</span></a></sup> uses <a href="/wiki/Radial_basis_function_kernel#Fourier_random_features" title="Radial basis function kernel">Fourier random features</a>:<span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \varphi (x)={\frac {1}{\sqrt {D}}}[\cos \langle w_{1},x\rangle ,\sin \langle w_{1},x\rangle ,\cdots \cos \langle w_{D},x\rangle ,\sin \langle w_{D},x\rangle ]^{T}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>φ<!-- φ --></mi> <mo stretchy="false">(</mo> <mi>x</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mn>1</mn> <msqrt> <mi>D</mi> </msqrt> </mfrac> </mrow> <mo stretchy="false">[</mo> <mi>cos</mi> <mo>⁡<!-- --></mo> <mo fence="false" stretchy="false">⟨<!-- ⟨ --></mo> <msub> <mi>w</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo>,</mo> <mi>x</mi> <mo fence="false" stretchy="false">⟩<!-- ⟩ --></mo> <mo>,</mo> <mi>sin</mi> <mo>⁡<!-- --></mo> <mo fence="false" stretchy="false">⟨<!-- ⟨ --></mo> <msub> <mi>w</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo>,</mo> <mi>x</mi> <mo fence="false" stretchy="false">⟩<!-- ⟩ --></mo> <mo>,</mo> <mo>⋯<!-- ⋯ --></mo> <mi>cos</mi> <mo>⁡<!-- --></mo> <mo fence="false" stretchy="false">⟨<!-- ⟨ --></mo> <msub> <mi>w</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>D</mi> </mrow> </msub> <mo>,</mo> <mi>x</mi> 
<mo fence="false" stretchy="false">⟩<!-- ⟩ --></mo> <mo>,</mo> <mi>sin</mi> <mo>⁡<!-- --></mo> <mo fence="false" stretchy="false">⟨<!-- ⟨ --></mo> <msub> <mi>w</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>D</mi> </mrow> </msub> <mo>,</mo> <mi>x</mi> <mo fence="false" stretchy="false">⟩<!-- ⟩ --></mo> <msup> <mo stretchy="false">]</mo> <mrow class="MJX-TeXAtom-ORD"> <mi>T</mi> </mrow> </msup> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \varphi (x)={\frac {1}{\sqrt {D}}}[\cos \langle w_{1},x\rangle ,\sin \langle w_{1},x\rangle ,\cdots \cos \langle w_{D},x\rangle ,\sin \langle w_{D},x\rangle ]^{T}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/243ed0310c01dc8193d985ea838e92191cec4fac" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.838ex; width:61.925ex; height:6.176ex;" alt="{\displaystyle \varphi (x)={\frac {1}{\sqrt {D}}}[\cos \langle w_{1},x\rangle ,\sin \langle w_{1},x\rangle ,\cdots \cos \langle w_{D},x\rangle ,\sin \langle w_{D},x\rangle ]^{T}}"></span>where <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle w_{1},...,w_{D}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>w</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo>,</mo> <mo>.</mo> <mo>.</mo> <mo>.</mo> <mo>,</mo> <msub> <mi>w</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>D</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle w_{1},...,w_{D}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/c97fcd8d8f90947efda4e03b5ffc293c61cb094c" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:11.145ex; height:2.009ex;" alt="{\displaystyle w_{1},...,w_{D}}"></span> are independent samples from the normal distribution <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle N(0,\sigma ^{2}I)}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>N</mi> <mo stretchy="false">(</mo> <mn>0</mn> <mo>,</mo> <msup> <mi>σ<!-- σ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> <mi>I</mi> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle N(0,\sigma ^{2}I)}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/9d30d920f052b8230e76c64a55f2cddc963b201f" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:9.626ex; height:3.176ex;" alt="{\displaystyle N(0,\sigma ^{2}I)}"></span>. 
This choice of parameters satisfy <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \mathbb {E} [\langle \varphi (x),\varphi (y)\rangle ]=e^{-{\frac {\|x-y\|^{2}}{2\sigma ^{2}}}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="double-struck">E</mi> </mrow> <mo stretchy="false">[</mo> <mo fence="false" stretchy="false">⟨<!-- ⟨ --></mo> <mi>φ<!-- φ --></mi> <mo stretchy="false">(</mo> <mi>x</mi> <mo stretchy="false">)</mo> <mo>,</mo> <mi>φ<!-- φ --></mi> <mo stretchy="false">(</mo> <mi>y</mi> <mo stretchy="false">)</mo> <mo fence="false" stretchy="false">⟩<!-- ⟩ --></mo> <mo stretchy="false">]</mo> <mo>=</mo> <msup> <mi>e</mi> <mrow class="MJX-TeXAtom-ORD"> <mo>−<!-- − --></mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mrow> <mo fence="false" stretchy="false">‖<!-- ‖ --></mo> <mi>x</mi> <mo>−<!-- − --></mo> <mi>y</mi> <msup> <mo fence="false" stretchy="false">‖<!-- ‖ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> </mrow> <mrow> <mn>2</mn> <msup> <mi>σ<!-- σ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> </mrow> </mfrac> </mrow> </mrow> </msup> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \mathbb {E} [\langle \varphi (x),\varphi (y)\rangle ]=e^{-{\frac {\|x-y\|^{2}}{2\sigma ^{2}}}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/2e0f85eabd1581c50b848cf5d2d73ce4e7ac6e1d" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:25.96ex; height:5.176ex;" alt="{\displaystyle \mathbb {E} [\langle \varphi (x),\varphi (y)\rangle ]=e^{-{\frac {\|x-y\|^{2}}{2\sigma ^{2}}}}}"></span>, or <span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle e^{\langle x,y\rangle /\sigma ^{2}}=\mathbb {E} [\langle e^{\|x\|^{2}/2\sigma ^{2}}\varphi (x),e^{\|y\|^{2}/2\sigma ^{2}}\varphi (y)\rangle ]\approx \langle e^{\|x\|^{2}/2\sigma ^{2}}\varphi (x),e^{\|y\|^{2}/2\sigma ^{2}}\varphi (y)\rangle }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msup> <mi>e</mi> <mrow class="MJX-TeXAtom-ORD"> <mo fence="false" stretchy="false">⟨<!-- ⟨ --></mo> <mi>x</mi> <mo>,</mo> <mi>y</mi> <mo fence="false" stretchy="false">⟩<!-- ⟩ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mo>/</mo> </mrow> <msup> <mi>σ<!-- σ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> </mrow> </msup> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="double-struck">E</mi> </mrow> <mo stretchy="false">[</mo> <mo fence="false" stretchy="false">⟨<!-- ⟨ --></mo> <msup> <mi>e</mi> <mrow class="MJX-TeXAtom-ORD"> <mo fence="false" stretchy="false">‖<!-- ‖ --></mo> <mi>x</mi> <msup> <mo fence="false" stretchy="false">‖<!-- ‖ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> <mrow class="MJX-TeXAtom-ORD"> <mo>/</mo> </mrow> <mn>2</mn> <msup> <mi>σ<!-- σ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> </mrow> </msup> <mi>φ<!-- φ --></mi> <mo stretchy="false">(</mo> <mi>x</mi> <mo stretchy="false">)</mo> <mo>,</mo> <msup> <mi>e</mi> <mrow class="MJX-TeXAtom-ORD"> <mo fence="false" stretchy="false">‖<!-- ‖ --></mo> 
<mi>y</mi> <msup> <mo fence="false" stretchy="false">‖<!-- ‖ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> <mrow class="MJX-TeXAtom-ORD"> <mo>/</mo> </mrow> <mn>2</mn> <msup> <mi>σ<!-- σ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> </mrow> </msup> <mi>φ<!-- φ --></mi> <mo stretchy="false">(</mo> <mi>y</mi> <mo stretchy="false">)</mo> <mo fence="false" stretchy="false">⟩<!-- ⟩ --></mo> <mo stretchy="false">]</mo> <mo>≈<!-- ≈ --></mo> <mo fence="false" stretchy="false">⟨<!-- ⟨ --></mo> <msup> <mi>e</mi> <mrow class="MJX-TeXAtom-ORD"> <mo fence="false" stretchy="false">‖<!-- ‖ --></mo> <mi>x</mi> <msup> <mo fence="false" stretchy="false">‖<!-- ‖ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> <mrow class="MJX-TeXAtom-ORD"> <mo>/</mo> </mrow> <mn>2</mn> <msup> <mi>σ<!-- σ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> </mrow> </msup> <mi>φ<!-- φ --></mi> <mo stretchy="false">(</mo> <mi>x</mi> <mo stretchy="false">)</mo> <mo>,</mo> <msup> <mi>e</mi> <mrow class="MJX-TeXAtom-ORD"> <mo fence="false" stretchy="false">‖<!-- ‖ --></mo> <mi>y</mi> <msup> <mo fence="false" stretchy="false">‖<!-- ‖ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> <mrow class="MJX-TeXAtom-ORD"> <mo>/</mo> </mrow> <mn>2</mn> <msup> <mi>σ<!-- σ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> </mrow> </msup> <mi>φ<!-- φ --></mi> <mo stretchy="false">(</mo> <mi>y</mi> <mo stretchy="false">)</mo> <mo fence="false" stretchy="false">⟩<!-- ⟩ --></mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle e^{\langle x,y\rangle /\sigma ^{2}}=\mathbb {E} [\langle e^{\|x\|^{2}/2\sigma ^{2}}\varphi (x),e^{\|y\|^{2}/2\sigma ^{2}}\varphi (y)\rangle ]\approx \langle e^{\|x\|^{2}/2\sigma ^{2}}\varphi (x),e^{\|y\|^{2}/2\sigma ^{2}}\varphi (y)\rangle }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/bfb56111453c9e03415021c39d21ed88a37d2ea1" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:72.765ex; height:3.676ex;" alt="{\displaystyle e^{\langle x,y\rangle /\sigma ^{2}}=\mathbb {E} [\langle e^{\|x\|^{2}/2\sigma ^{2}}\varphi (x),e^{\|y\|^{2}/2\sigma ^{2}}\varphi (y)\rangle ]\approx \langle e^{\|x\|^{2}/2\sigma ^{2}}\varphi (x),e^{\|y\|^{2}/2\sigma ^{2}}\varphi (y)\rangle }"></span>Consequently, the one-headed attention, with one query, can be written as <span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\text{Attention}}(q,K,V)={\text{softmax}}\left({\frac {qK^{\mathrm {T} }}{\sqrt {d_{k}}}}\right)V\approx {\frac {\varphi (q)^{T}\sum _{i}e^{\|k_{i}\|^{2}/2\sigma ^{2}}\varphi (k_{i})v_{i}^{T}}{\varphi (q)^{T}\sum _{i}e^{\|k_{i}\|^{2}/2\sigma ^{2}}\varphi (k_{i})}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mtext>Attention</mtext> </mrow> <mo stretchy="false">(</mo> <mi>q</mi> <mo>,</mo> <mi>K</mi> <mo>,</mo> <mi>V</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mtext>softmax</mtext> </mrow> <mrow> <mo>(</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mrow> <mi>q</mi> <msup> <mi>K</mi> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">T</mi> </mrow> </mrow> </msup> </mrow> <msqrt> <msub> 
<mi>d</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>k</mi> </mrow> </msub> </msqrt> </mfrac> </mrow> <mo>)</mo> </mrow> <mi>V</mi> <mo>≈<!-- ≈ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mrow> <mi>φ<!-- φ --></mi> <mo stretchy="false">(</mo> <mi>q</mi> <msup> <mo stretchy="false">)</mo> <mrow class="MJX-TeXAtom-ORD"> <mi>T</mi> </mrow> </msup> <munder> <mo>∑<!-- ∑ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </munder> <msup> <mi>e</mi> <mrow class="MJX-TeXAtom-ORD"> <mo fence="false" stretchy="false">‖<!-- ‖ --></mo> <msub> <mi>k</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </msub> <msup> <mo fence="false" stretchy="false">‖<!-- ‖ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> <mrow class="MJX-TeXAtom-ORD"> <mo>/</mo> </mrow> <mn>2</mn> <msup> <mi>σ<!-- σ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> </mrow> </msup> <mi>φ<!-- φ --></mi> <mo stretchy="false">(</mo> <msub> <mi>k</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </msub> <mo stretchy="false">)</mo> <msubsup> <mi>v</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>T</mi> </mrow> </msubsup> </mrow> <mrow> <mi>φ<!-- φ --></mi> <mo stretchy="false">(</mo> <mi>q</mi> <msup> <mo stretchy="false">)</mo> <mrow class="MJX-TeXAtom-ORD"> <mi>T</mi> </mrow> </msup> <munder> <mo>∑<!-- ∑ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </munder> <msup> <mi>e</mi> <mrow class="MJX-TeXAtom-ORD"> <mo fence="false" stretchy="false">‖<!-- ‖ --></mo> <msub> <mi>k</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </msub> <msup> <mo fence="false" stretchy="false">‖<!-- ‖ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> <mrow class="MJX-TeXAtom-ORD"> <mo>/</mo> </mrow> <mn>2</mn> <msup> <mi>σ<!-- σ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> </mrow> </msup> <mi>φ<!-- φ --></mi> <mo stretchy="false">(</mo> <msub> <mi>k</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </msub> <mo stretchy="false">)</mo> </mrow> </mfrac> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\text{Attention}}(q,K,V)={\text{softmax}}\left({\frac {qK^{\mathrm {T} }}{\sqrt {d_{k}}}}\right)V\approx {\frac {\varphi (q)^{T}\sum _{i}e^{\|k_{i}\|^{2}/2\sigma ^{2}}\varphi (k_{i})v_{i}^{T}}{\varphi (q)^{T}\sum _{i}e^{\|k_{i}\|^{2}/2\sigma ^{2}}\varphi (k_{i})}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/bca1667ac7c453bd1979496b1b951fc9c09d3b08" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -3.338ex; width:71.8ex; height:8.009ex;" alt="{\displaystyle {\text{Attention}}(q,K,V)={\text{softmax}}\left({\frac {qK^{\mathrm {T} }}{\sqrt {d_{k}}}}\right)V\approx {\frac {\varphi (q)^{T}\sum _{i}e^{\|k_{i}\|^{2}/2\sigma ^{2}}\varphi (k_{i})v_{i}^{T}}{\varphi (q)^{T}\sum _{i}e^{\|k_{i}\|^{2}/2\sigma ^{2}}\varphi (k_{i})}}}"></span>where <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \sigma =d_{K}^{1/4}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>σ<!-- σ --></mi> <mo>=</mo> <msubsup> <mi>d</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>K</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> <mrow class="MJX-TeXAtom-ORD"> <mo>/</mo> </mrow> <mn>4</mn> </mrow> </msubsup> </mstyle> </mrow> 
<annotation encoding="application/x-tex">{\displaystyle \sigma =d_{K}^{1/4}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/d2571d23e3b9162609ece1399f4abf50811e4eb6" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.005ex; width:8.344ex; height:3.676ex;" alt="{\displaystyle \sigma =d_{K}^{1/4}}"></span>. Similarly for multiple queries, and for multiheaded attention. </p><p>This approximation can be computed in linear time, as we can compute the matrix <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \varphi (k_{i})v_{i}^{T}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>φ<!-- φ --></mi> <mo stretchy="false">(</mo> <msub> <mi>k</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </msub> <mo stretchy="false">)</mo> <msubsup> <mi>v</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>T</mi> </mrow> </msubsup> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \varphi (k_{i})v_{i}^{T}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/6276b79cd088813503ec00ae5eab91ab02d8a7f0" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.005ex; width:7.857ex; height:3.176ex;" alt="{\displaystyle \varphi (k_{i})v_{i}^{T}}"></span> first, then multiply it with the query. In essence, we have managed to obtain a more precise version of <span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\text{Attention}}(Q,K,V)={\text{softmax}}\left({\frac {QK^{\mathrm {T} }}{\sqrt {d_{k}}}}\right)V\approx Q(K^{T}V/{\sqrt {d_{k}}})}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mtext>Attention</mtext> </mrow> <mo stretchy="false">(</mo> <mi>Q</mi> <mo>,</mo> <mi>K</mi> <mo>,</mo> <mi>V</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mtext>softmax</mtext> </mrow> <mrow> <mo>(</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mrow> <mi>Q</mi> <msup> <mi>K</mi> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">T</mi> </mrow> </mrow> </msup> </mrow> <msqrt> <msub> <mi>d</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>k</mi> </mrow> </msub> </msqrt> </mfrac> </mrow> <mo>)</mo> </mrow> <mi>V</mi> <mo>≈<!-- ≈ --></mo> <mi>Q</mi> <mo stretchy="false">(</mo> <msup> <mi>K</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>T</mi> </mrow> </msup> <mi>V</mi> <mrow class="MJX-TeXAtom-ORD"> <mo>/</mo> </mrow> <mrow class="MJX-TeXAtom-ORD"> <msqrt> <msub> <mi>d</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>k</mi> </mrow> </msub> </msqrt> </mrow> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\text{Attention}}(Q,K,V)={\text{softmax}}\left({\frac {QK^{\mathrm {T} }}{\sqrt {d_{k}}}}\right)V\approx Q(K^{T}V/{\sqrt {d_{k}}})}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/7da7e0c09e0f526924af6c95f7c0d2c6fb34b910" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" 
style="vertical-align: -3.171ex; width:60.802ex; height:7.509ex;" alt="{\displaystyle {\text{Attention}}(Q,K,V)={\text{softmax}}\left({\frac {QK^{\mathrm {T} }}{\sqrt {d_{k}}}}\right)V\approx Q(K^{T}V/{\sqrt {d_{k}}})}"></span>Performer (2022)<sup id="cite_ref-104" class="reference"><a href="#cite_note-104"><span class="cite-bracket">[</span>102<span class="cite-bracket">]</span></a></sup> uses the same Random Feature Attention, but <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle w_{1},...,w_{D}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>w</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo>,</mo> <mo>.</mo> <mo>.</mo> <mo>.</mo> <mo>,</mo> <msub> <mi>w</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>D</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle w_{1},...,w_{D}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/c97fcd8d8f90947efda4e03b5ffc293c61cb094c" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:11.145ex; height:2.009ex;" alt="{\displaystyle w_{1},...,w_{D}}"></span> are first independently sampled from the normal distribution <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle N(0,\sigma ^{2}I)}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>N</mi> <mo stretchy="false">(</mo> <mn>0</mn> <mo>,</mo> <msup> <mi>σ<!-- σ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> <mi>I</mi> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle N(0,\sigma ^{2}I)}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/9d30d920f052b8230e76c64a55f2cddc963b201f" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:9.626ex; height:3.176ex;" alt="{\displaystyle N(0,\sigma ^{2}I)}"></span>, then they are <a href="/wiki/Gram%E2%80%93Schmidt_process" title="Gram–Schmidt process">Gram-Schmidt processed</a>. </p> <div class="mw-heading mw-heading3"><h3 id="Multimodality">Multimodality</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Transformer_(deep_learning_architecture)&action=edit&section=43" title="Edit section: Multimodality"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Transformers can also be used/adapted for modalities (input or output) beyond just text, usually by finding a way to "tokenize" the modality. </p><p>Multimodal models can either be trained from scratch, or by finetuning. 
A 2022 study found that Transformers pretrained only on natural language can be finetuned by updating only 0.03% of their parameters and become competitive with LSTMs on a variety of logical and visual tasks, demonstrating <a href="/wiki/Transfer_learning" title="Transfer learning">transfer learning</a>.<sup id="cite_ref-105" class="reference"><a href="#cite_note-105"><span class="cite-bracket">[</span>103<span class="cite-bracket">]</span></a></sup> LLaVA is a vision-language model composed of a language model (Vicuna-13B)<sup id="cite_ref-106" class="reference"><a href="#cite_note-106"><span class="cite-bracket">[</span>104<span class="cite-bracket">]</span></a></sup> and a vision model (<a href="/wiki/Vision_transformer" title="Vision transformer">ViT</a>-L/14), connected by a linear layer. Only the linear layer is finetuned.<sup id="cite_ref-107" class="reference"><a href="#cite_note-107"><span class="cite-bracket">[</span>105<span class="cite-bracket">]</span></a></sup> </p><p><a href="/wiki/Vision_transformer" title="Vision transformer">Vision transformers</a><sup id="cite_ref-auto2_43-1" class="reference"><a href="#cite_note-auto2-43"><span class="cite-bracket">[</span>41<span class="cite-bracket">]</span></a></sup> adapt the transformer to computer vision by breaking down input images into a series of patches, turning them into vectors, and treating them like tokens in a standard transformer. </p><p>Conformer<sup id="cite_ref-Gulati2020_44-1" class="reference"><a href="#cite_note-Gulati2020-44"><span class="cite-bracket">[</span>42<span class="cite-bracket">]</span></a></sup> and later <a href="/wiki/Whisper_(speech_recognition_system)" title="Whisper (speech recognition system)">Whisper</a><sup id="cite_ref-Radford_Kim_Xu_Brockman_p._108-0" class="reference"><a href="#cite_note-Radford_Kim_Xu_Brockman_p.-108"><span class="cite-bracket">[</span>106<span class="cite-bracket">]</span></a></sup> follow the same pattern for <a href="/wiki/Speech_recognition" title="Speech recognition">speech recognition</a>, first turning the speech signal into a <a href="/wiki/Spectrogram" title="Spectrogram">spectrogram</a>, which is then treated like an image, i.e., broken down into a series of patches, turned into vectors and treated like tokens in a standard transformer. </p><p><a href="/wiki/Perceiver" title="Perceiver">Perceivers</a><sup id="cite_ref-perceiver2021_109-0" class="reference"><a href="#cite_note-perceiver2021-109"><span class="cite-bracket">[</span>107<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-jaegle2021b_110-0" class="reference"><a href="#cite_note-jaegle2021b-110"><span class="cite-bracket">[</span>108<span class="cite-bracket">]</span></a></sup> are a variant of Transformers designed for multimodality. </p><p>For image generation, notable architectures are <a href="/wiki/DALL-E" title="DALL-E">DALL-E 1</a> (2021), Parti (2022),<sup id="cite_ref-111" class="reference"><a href="#cite_note-111"><span class="cite-bracket">[</span>109<span class="cite-bracket">]</span></a></sup> Phenaki (2023),<sup id="cite_ref-:13_112-0" class="reference"><a href="#cite_note-:13-112"><span class="cite-bracket">[</span>110<span class="cite-bracket">]</span></a></sup> and Muse (2023).<sup id="cite_ref-:12_113-0" class="reference"><a href="#cite_note-:12-113"><span class="cite-bracket">[</span>111<span class="cite-bracket">]</span></a></sup> Unlike later models, DALL-E is not a diffusion model. 
Instead, it uses a decoder-only Transformer that autoregressively generates a text, followed by the token representation of an image, which is then converted by a <a href="/wiki/Variational_autoencoder" title="Variational autoencoder">variational autoencoder</a> to an image.<sup id="cite_ref-114" class="reference"><a href="#cite_note-114"><span class="cite-bracket">[</span>112<span class="cite-bracket">]</span></a></sup> Parti is an encoder-decoder Transformer, where the encoder processes a text prompt, and the decoder generates a token representation of an image.<sup id="cite_ref-115" class="reference"><a href="#cite_note-115"><span class="cite-bracket">[</span>113<span class="cite-bracket">]</span></a></sup> Muse is an encoder-only Transformer that is trained to predict masked image tokens from unmasked image tokens. During generation, all input tokens are masked, and the highest-confidence predictions are included for the next iteration, until all tokens are predicted.<sup id="cite_ref-:12_113-1" class="reference"><a href="#cite_note-:12-113"><span class="cite-bracket">[</span>111<span class="cite-bracket">]</span></a></sup> Phenaki is a text-to-video model. It is a bidirectional masked transformer conditioned on pre-computed text tokens. The generated tokens are then decoded to a video.<sup id="cite_ref-:13_112-1" class="reference"><a href="#cite_note-:13-112"><span class="cite-bracket">[</span>110<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading2"><h2 id="Applications">Applications</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Transformer_(deep_learning_architecture)&action=edit&section=44" title="Edit section: Applications"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>The transformer has had great success in <a href="/wiki/Natural_language_processing" title="Natural language processing">natural language processing</a> (NLP). 
Many <a href="/wiki/Large_language_model" title="Large language model">large language models</a> such as <a href="/wiki/GPT-2" title="GPT-2">GPT-2</a>, <a href="/wiki/GPT-3" title="GPT-3">GPT-3</a>, <a href="/wiki/GPT-4" title="GPT-4">GPT-4</a>, AlbertAGPT, <a href="/wiki/Anthropic#Claude" title="Anthropic">Claude</a>, <a href="/wiki/BERT_(language_model)" title="BERT (language model)">BERT</a>, <a href="/wiki/XLNet" title="XLNet">XLNet</a>, <a href="/wiki/BERT_(language_model)#RoBERTa" title="BERT (language model)">RoBERTa</a> and <a href="/wiki/ChatGPT" title="ChatGPT">ChatGPT</a> demonstrate the ability of transformers to perform a wide variety of NLP-related subtasks and their related real-world applications, including: </p> <ul><li><a href="/wiki/Machine_translation" title="Machine translation">machine translation</a></li> <li><a href="/wiki/Time_series" title="Time series">time series</a> prediction</li> <li><a href="/wiki/Automatic_summarization" title="Automatic summarization">document summarization</a></li> <li><a href="/wiki/Natural_language_generation" title="Natural language generation">document generation</a></li> <li><a href="/wiki/Named-entity_recognition" title="Named-entity recognition">named entity recognition</a> (NER)<sup id="cite_ref-116" class="reference"><a href="#cite_note-116"><span class="cite-bracket">[</span>114<span class="cite-bracket">]</span></a></sup></li> <li><a href="/wiki/Computer_programming" title="Computer programming">writing computer code</a> based on requirements expressed in natural language.</li> <li><a href="/wiki/Speech-to-text" class="mw-redirect" title="Speech-to-text">speech-to-text</a></li></ul> <p>Beyond traditional NLP, the transformer architecture has had success in other applications, such as: </p> <ul><li><a href="/wiki/Sequence_analysis" title="Sequence analysis">biological sequence analysis</a></li> <li><a href="/wiki/Computer_vision" title="Computer vision">video understanding</a></li> <li><a href="/wiki/Protein_structure_prediction" title="Protein structure prediction">protein folding</a> (such as <a href="/wiki/AlphaFold" title="AlphaFold">AlphaFold</a>)</li> <li><a href="/wiki/Evaluation_function" title="Evaluation function">evaluating</a> chess board positions. 
Using static evaluation alone (that is, with no <a href="/wiki/Minimax" title="Minimax">Minimax</a> search) transformer achieved an <a href="/wiki/Elo_rating_system" title="Elo rating system">Elo</a> of 2895, putting it at <a href="/wiki/Grandmaster_(chess)" title="Grandmaster (chess)">grandmaster</a> level.<sup id="cite_ref-grandmaster_10-1" class="reference"><a href="#cite_note-grandmaster-10"><span class="cite-bracket">[</span>10<span class="cite-bracket">]</span></a></sup></li></ul> <div class="mw-heading mw-heading2"><h2 id="See_also">See also</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Transformer_(deep_learning_architecture)&action=edit&section=45" title="Edit section: See also"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <ul><li><a href="/wiki/Seq2seq" title="Seq2seq">seq2seq</a> – Family of machine learning approaches</li> <li><a href="/wiki/Perceiver" title="Perceiver">Perceiver</a> – Variant of Transformer designed for multimodal data</li> <li><a href="/wiki/Vision_transformer" title="Vision transformer">Vision transformer</a> – Machine learning model for vision processing</li> <li><a href="/wiki/Large_language_model" title="Large language model">Large language model</a> – Type of machine learning model</li> <li><a href="/wiki/BERT_(language_model)" title="BERT (language model)">BERT (language model)</a> – Series of language models developed by Google AI</li> <li><a href="/wiki/Generative_pre-trained_transformer" title="Generative pre-trained transformer">Generative pre-trained transformer</a> – Type of large language model</li> <li><a href="/wiki/T5_(language_model)" title="T5 (language model)">T5 (language model)</a> – Series of large language models developed by Google AI</li></ul> <div class="mw-heading mw-heading2"><h2 id="Notes">Notes</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Transformer_(deep_learning_architecture)&action=edit&section=46" title="Edit section: Notes"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <style data-mw-deduplicate="TemplateStyles:r1239543626">.mw-parser-output .reflist{margin-bottom:0.5em;list-style-type:decimal}@media screen{.mw-parser-output .reflist{font-size:90%}}.mw-parser-output .reflist .references{font-size:100%;margin-bottom:0;list-style-type:inherit}.mw-parser-output .reflist-columns-2{column-width:30em}.mw-parser-output .reflist-columns-3{column-width:25em}.mw-parser-output .reflist-columns{margin-top:0.3em}.mw-parser-output .reflist-columns ol{margin-top:0}.mw-parser-output .reflist-columns li{page-break-inside:avoid;break-inside:avoid-column}.mw-parser-output .reflist-upper-alpha{list-style-type:upper-alpha}.mw-parser-output .reflist-upper-roman{list-style-type:upper-roman}.mw-parser-output .reflist-lower-alpha{list-style-type:lower-alpha}.mw-parser-output .reflist-lower-greek{list-style-type:lower-greek}.mw-parser-output .reflist-lower-roman{list-style-type:lower-roman}</style><div class="reflist"> <div class="mw-references-wrap"><ol class="references"> <li id="cite_note-13"><span class="mw-cite-backlink"><b><a href="#cite_ref-13">^</a></b></span> <span class="reference-text"><a href="/wiki/Gated_recurrent_units" class="mw-redirect" title="Gated recurrent units">Gated recurrent units</a> (2014) further reduced its complexity.</span> </li> <li id="cite_note-17"><span class="mw-cite-backlink"><b><a href="#cite_ref-17">^</a></b></span> <span 
class="reference-text">Some architectures, such as RWKV or state space models, avoid the issue.</span> </li> </ol></div></div> <div class="mw-heading mw-heading2"><h2 id="References">References</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Transformer_(deep_learning_architecture)&action=edit&section=47" title="Edit section: References"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1239543626"><div class="reflist"> <div class="mw-references-wrap mw-references-columns"><ol class="references"> <li id="cite_note-2017_Attention_Is_All_You_Need-1"><span class="mw-cite-backlink">^ <a href="#cite_ref-2017_Attention_Is_All_You_Need_1-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-2017_Attention_Is_All_You_Need_1-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-2017_Attention_Is_All_You_Need_1-2"><sup><i><b>c</b></i></sup></a> <a href="#cite_ref-2017_Attention_Is_All_You_Need_1-3"><sup><i><b>d</b></i></sup></a> <a href="#cite_ref-2017_Attention_Is_All_You_Need_1-4"><sup><i><b>e</b></i></sup></a> <a href="#cite_ref-2017_Attention_Is_All_You_Need_1-5"><sup><i><b>f</b></i></sup></a> <a href="#cite_ref-2017_Attention_Is_All_You_Need_1-6"><sup><i><b>g</b></i></sup></a> <a href="#cite_ref-2017_Attention_Is_All_You_Need_1-7"><sup><i><b>h</b></i></sup></a> <a href="#cite_ref-2017_Attention_Is_All_You_Need_1-8"><sup><i><b>i</b></i></sup></a> <a href="#cite_ref-2017_Attention_Is_All_You_Need_1-9"><sup><i><b>j</b></i></sup></a> <a href="#cite_ref-2017_Attention_Is_All_You_Need_1-10"><sup><i><b>k</b></i></sup></a> <a href="#cite_ref-2017_Attention_Is_All_You_Need_1-11"><sup><i><b>l</b></i></sup></a></span> <span class="reference-text"><style data-mw-deduplicate="TemplateStyles:r1238218222">.mw-parser-output cite.citation{font-style:inherit;word-wrap:break-word}.mw-parser-output .citation q{quotes:"\"""\"""'""'"}.mw-parser-output .citation:target{background-color:rgba(0,127,255,0.133)}.mw-parser-output .id-lock-free.id-lock-free a{background:url("//upload.wikimedia.org/wikipedia/commons/6/65/Lock-green.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-limited.id-lock-limited a,.mw-parser-output .id-lock-registration.id-lock-registration a{background:url("//upload.wikimedia.org/wikipedia/commons/d/d6/Lock-gray-alt-2.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-subscription.id-lock-subscription a{background:url("//upload.wikimedia.org/wikipedia/commons/a/aa/Lock-red-alt-2.svg")right 0.1em center/9px no-repeat}.mw-parser-output .cs1-ws-icon a{background:url("//upload.wikimedia.org/wikipedia/commons/4/4c/Wikisource-logo.svg")right 0.1em center/12px no-repeat}body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-free a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-limited a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-registration a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-subscription a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .cs1-ws-icon a{background-size:contain;padding:0 1em 0 0}.mw-parser-output .cs1-code{color:inherit;background:inherit;border:none;padding:inherit}.mw-parser-output .cs1-hidden-error{display:none;color:var(--color-error,#d33)}.mw-parser-output .cs1-visible-error{color:var(--color-error,#d33)}.mw-parser-output 
.cs1-maint{display:none;color:#085;margin-left:0.3em}.mw-parser-output .cs1-kern-left{padding-left:0.2em}.mw-parser-output .cs1-kern-right{padding-right:0.2em}.mw-parser-output .citation .mw-selflink{font-weight:inherit}@media screen{.mw-parser-output .cs1-format{font-size:95%}html.skin-theme-clientpref-night .mw-parser-output .cs1-maint{color:#18911f}}@media screen and (prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .cs1-maint{color:#18911f}}</style><cite id="CITEREFVaswaniShazeerParmarUszkoreit2017" class="citation journal cs1"><a href="/wiki/Ashish_Vaswani" title="Ashish Vaswani">Vaswani, Ashish</a>; Shazeer, Noam; Parmar, Niki; Uszkoreit, Jakob; Jones, Llion; <a href="/wiki/Aidan_Gomez" title="Aidan Gomez">Gomez, Aidan N</a>; Kaiser, Łukasz; Polosukhin, Illia (2017). <a rel="nofollow" class="external text" href="https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf">"Attention is All you Need"</a> <span class="cs1-format">(PDF)</span>. <i>Advances in Neural Information Processing Systems</i>. <b>30</b>. Curran Associates, Inc.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Advances+in+Neural+Information+Processing+Systems&rft.atitle=Attention+is+All+you+Need&rft.volume=30&rft.date=2017&rft.aulast=Vaswani&rft.aufirst=Ashish&rft.au=Shazeer%2C+Noam&rft.au=Parmar%2C+Niki&rft.au=Uszkoreit%2C+Jakob&rft.au=Jones%2C+Llion&rft.au=Gomez%2C+Aidan+N&rft.au=Kaiser%2C+%C5%81ukasz&rft.au=Polosukhin%2C+Illia&rft_id=https%3A%2F%2Fproceedings.neurips.cc%2Fpaper%2F2017%2Ffile%2F3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-lstm1997-2"><span class="mw-cite-backlink"><b><a href="#cite_ref-lstm1997_2-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFHochreiterSchmidhuber1997" class="citation journal cs1"><a href="/wiki/Sepp_Hochreiter" title="Sepp Hochreiter">Hochreiter, Sepp</a>; <a href="/wiki/J%C3%BCrgen_Schmidhuber" title="Jürgen Schmidhuber">Schmidhuber, Jürgen</a> (1 November 1997). "Long Short-Term Memory". <i>Neural Computation</i>. <b>9</b> (8): <span class="nowrap">1735–</span>1780. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1162%2Fneco.1997.9.8.1735">10.1162/neco.1997.9.8.1735</a>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/0899-7667">0899-7667</a>. <a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a> <a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/9377276">9377276</a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:1915014">1915014</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Neural+Computation&rft.atitle=Long+Short-Term+Memory&rft.volume=9&rft.issue=8&rft.pages=%3Cspan+class%3D%22nowrap%22%3E1735-%3C%2Fspan%3E1780&rft.date=1997-11-01&rft.issn=0899-7667&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A1915014%23id-name%3DS2CID&rft_id=info%3Apmid%2F9377276&rft_id=info%3Adoi%2F10.1162%2Fneco.1997.9.8.1735&rft.aulast=Hochreiter&rft.aufirst=Sepp&rft.au=Schmidhuber%2C+J%C3%BCrgen&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-:7-3"><span class="mw-cite-backlink">^ <a href="#cite_ref-:7_3-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:7_3-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://openai.com/blog/better-language-models/">"Better Language Models and Their Implications"</a>. <i>OpenAI</i>. 2019-02-14. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20201219132206/https://openai.com/blog/better-language-models/">Archived</a> from the original on 2020-12-19<span class="reference-accessdate">. Retrieved <span class="nowrap">2019-08-25</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=OpenAI&rft.atitle=Better+Language+Models+and+Their+Implications&rft.date=2019-02-14&rft_id=https%3A%2F%2Fopenai.com%2Fblog%2Fbetter-language-models%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-inventors-4"><span class="mw-cite-backlink">^ <a href="#cite_ref-inventors_4-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-inventors_4-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBahdanauChoBengio2014" class="citation arxiv cs1">Bahdanau; Cho, Kyunghyun; Bengio, Yoshua (September 1, 2014). "Neural Machine Translation by Jointly Learning to Align and Translate". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1409.0473">1409.0473</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Neural+Machine+Translation+by+Jointly+Learning+to+Align+and+Translate&rft.date=2014-09-01&rft_id=info%3Aarxiv%2F1409.0473&rft.au=Bahdanau&rft.au=Cho%2C+Kyunghyun&rft.au=Bengio%2C+Yoshua&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-inventconfirm-5"><span class="mw-cite-backlink"><b><a href="#cite_ref-inventconfirm_5-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLuongPhamManning2015" class="citation arxiv cs1">Luong, Minh-Thang; Pham, Hieu; Manning, Christopher D. (August 17, 2015). "Effective Approaches to Attention-based Neural Machine Translation". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1508.04025">1508.04025</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Effective+Approaches+to+Attention-based+Neural+Machine+Translation&rft.date=2015-08-17&rft_id=info%3Aarxiv%2F1508.04025&rft.aulast=Luong&rft.aufirst=Minh-Thang&rft.au=Pham%2C+Hieu&rft.au=Manning%2C+Christopher+D.&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-:10-6"><span class="mw-cite-backlink">^ <a href="#cite_ref-:10_6-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:10_6-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFChenLuRajeswaranLee2021" class="citation cs2">Chen, Lili; Lu, Kevin; Rajeswaran, Aravind; Lee, Kimin; Grover, Aditya; Laskin, Michael; Abbeel, Pieter; Srinivas, Aravind; Mordatch, Igor (2021-06-24), <i>Decision Transformer: Reinforcement Learning via Sequence Modeling</i>, <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2106.01345">2106.01345</a></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Decision+Transformer%3A+Reinforcement+Learning+via+Sequence+Modeling&rft.date=2021-06-24&rft_id=info%3Aarxiv%2F2106.01345&rft.aulast=Chen&rft.aufirst=Lili&rft.au=Lu%2C+Kevin&rft.au=Rajeswaran%2C+Aravind&rft.au=Lee%2C+Kimin&rft.au=Grover%2C+Aditya&rft.au=Laskin%2C+Michael&rft.au=Abbeel%2C+Pieter&rft.au=Srinivas%2C+Aravind&rft.au=Mordatch%2C+Igor&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-7"><span class="mw-cite-backlink"><b><a href="#cite_ref-7">^</a></b></span> <span 
class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFParisottoSongRaePascanu2020" class="citation journal cs1">Parisotto, Emilio; Song, Francis; Rae, Jack; Pascanu, Razvan; Gulcehre, Caglar; Jayakumar, Siddhant; Jaderberg, Max; Kaufman, Raphaël Lopez; Clark, Aidan; Noury, Seb; Botvinick, Matthew; Heess, Nicolas; Hadsell, Raia (2020-11-21). <a rel="nofollow" class="external text" href="https://proceedings.mlr.press/v119/parisotto20a.html">"Stabilizing Transformers for Reinforcement Learning"</a>. <i>Proceedings of the 37th International Conference on Machine Learning</i>. PMLR: <span class="nowrap">7487–</span>7498.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Proceedings+of+the+37th+International+Conference+on+Machine+Learning&rft.atitle=Stabilizing+Transformers+for+Reinforcement+Learning&rft.pages=%3Cspan+class%3D%22nowrap%22%3E7487-%3C%2Fspan%3E7498&rft.date=2020-11-21&rft.aulast=Parisotto&rft.aufirst=Emilio&rft.au=Song%2C+Francis&rft.au=Rae%2C+Jack&rft.au=Pascanu%2C+Razvan&rft.au=Gulcehre%2C+Caglar&rft.au=Jayakumar%2C+Siddhant&rft.au=Jaderberg%2C+Max&rft.au=Kaufman%2C+Rapha%C3%ABl+Lopez&rft.au=Clark%2C+Aidan&rft.au=Noury%2C+Seb&rft.au=Botvinick%2C+Matthew&rft.au=Heess%2C+Nicolas&rft.au=Hadsell%2C+Raia&rft_id=https%3A%2F%2Fproceedings.mlr.press%2Fv119%2Fparisotto20a.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-Robust_Speech_Recognition_via_Large-Scale_Weak_Supervision-8"><span class="mw-cite-backlink"><b><a href="#cite_ref-Robust_Speech_Recognition_via_Large-Scale_Weak_Supervision_8-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRadfordJong_Wook_KimXuBrockman2022" class="citation arxiv cs1">Radford, Alec; Jong Wook Kim; Xu, Tao; Brockman, Greg; McLeavey, Christine; Sutskever, Ilya (2022). "Robust Speech Recognition via Large-Scale Weak Supervision". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2212.04356">2212.04356</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/eess.AS">eess.AS</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Robust+Speech+Recognition+via+Large-Scale+Weak+Supervision&rft.date=2022&rft_id=info%3Aarxiv%2F2212.04356&rft.aulast=Radford&rft.aufirst=Alec&rft.au=Jong+Wook+Kim&rft.au=Xu%2C+Tao&rft.au=Brockman%2C+Greg&rft.au=McLeavey%2C+Christine&rft.au=Sutskever%2C+Ilya&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-9"><span class="mw-cite-backlink"><b><a href="#cite_ref-9">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFMonastirskyAzulaySintov2023" class="citation journal cs1">Monastirsky, Maxim; Azulay, Osher; Sintov, Avishai (February 2023). <a rel="nofollow" class="external text" href="https://ieeexplore.ieee.org/document/9984828">"Learning to Throw With a Handful of Samples Using Decision Transformers"</a>. <i>IEEE Robotics and Automation Letters</i>. 
<b>8</b> (2): <span class="nowrap">576–</span>583. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FLRA.2022.3229266">10.1109/LRA.2022.3229266</a>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/2377-3766">2377-3766</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=IEEE+Robotics+and+Automation+Letters&rft.atitle=Learning+to+Throw+With+a+Handful+of+Samples+Using+Decision+Transformers&rft.volume=8&rft.issue=2&rft.pages=%3Cspan+class%3D%22nowrap%22%3E576-%3C%2Fspan%3E583&rft.date=2023-02&rft_id=info%3Adoi%2F10.1109%2FLRA.2022.3229266&rft.issn=2377-3766&rft.aulast=Monastirsky&rft.aufirst=Maxim&rft.au=Azulay%2C+Osher&rft.au=Sintov%2C+Avishai&rft_id=https%3A%2F%2Fieeexplore.ieee.org%2Fdocument%2F9984828&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-grandmaster-10"><span class="mw-cite-backlink">^ <a href="#cite_ref-grandmaster_10-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-grandmaster_10-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRuossDelétangMedapatiGrau-Moya2024" class="citation arxiv cs1">Ruoss, Anian; Delétang, Grégoire; Medapati, Sourabh; Grau-Moya, Jordi; Wenliang, Li; Catt, Elliot; Reid, John; Genewein, Tim (2024-02-07). "Grandmaster-Level Chess Without Search". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2402.04494v1">2402.04494v1</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Grandmaster-Level+Chess+Without+Search&rft.date=2024-02-07&rft_id=info%3Aarxiv%2F2402.04494v1&rft.aulast=Ruoss&rft.aufirst=Anian&rft.au=Del%C3%A9tang%2C+Gr%C3%A9goire&rft.au=Medapati%2C+Sourabh&rft.au=Grau-Moya%2C+Jordi&rft.au=Wenliang%2C+Li&rft.au=Catt%2C+Elliot&rft.au=Reid%2C+John&rft.au=Genewein%2C+Tim&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-wolf2020-11"><span class="mw-cite-backlink">^ <a href="#cite_ref-wolf2020_11-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-wolf2020_11-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWolfDebutSanhChaumond2020" class="citation book cs1">Wolf, Thomas; Debut, Lysandre; Sanh, Victor; Chaumond, Julien; Delangue, Clement; Moi, Anthony; Cistac, Pierric; Rault, Tim; Louf, Remi; Funtowicz, Morgan; Davison, Joe; Shleifer, Sam; von Platen, Patrick; Ma, Clara; Jernite, Yacine; Plu, Julien; Xu, Canwen; Le Scao, Teven; Gugger, Sylvain; Drame, Mariama; Lhoest, Quentin; Rush, Alexander (2020). "Transformers: State-of-the-Art Natural Language Processing". <i>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations</i>. pp. 
<span class="nowrap">38–</span>45. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.18653%2Fv1%2F2020.emnlp-demos.6">10.18653/v1/2020.emnlp-demos.6</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:208117506">208117506</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Transformers%3A+State-of-the-Art+Natural+Language+Processing&rft.btitle=Proceedings+of+the+2020+Conference+on+Empirical+Methods+in+Natural+Language+Processing%3A+System+Demonstrations&rft.pages=%3Cspan+class%3D%22nowrap%22%3E38-%3C%2Fspan%3E45&rft.date=2020&rft_id=info%3Adoi%2F10.18653%2Fv1%2F2020.emnlp-demos.6&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A208117506%23id-name%3DS2CID&rft.aulast=Wolf&rft.aufirst=Thomas&rft.au=Debut%2C+Lysandre&rft.au=Sanh%2C+Victor&rft.au=Chaumond%2C+Julien&rft.au=Delangue%2C+Clement&rft.au=Moi%2C+Anthony&rft.au=Cistac%2C+Pierric&rft.au=Rault%2C+Tim&rft.au=Louf%2C+Remi&rft.au=Funtowicz%2C+Morgan&rft.au=Davison%2C+Joe&rft.au=Shleifer%2C+Sam&rft.au=von+Platen%2C+Patrick&rft.au=Ma%2C+Clara&rft.au=Jernite%2C+Yacine&rft.au=Plu%2C+Julien&rft.au=Xu%2C+Canwen&rft.au=Le+Scao%2C+Teven&rft.au=Gugger%2C+Sylvain&rft.au=Drame%2C+Mariama&rft.au=Lhoest%2C+Quentin&rft.au=Rush%2C+Alexander&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-:6-12"><span class="mw-cite-backlink">^ <a href="#cite_ref-:6_12-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:6_12-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-:6_12-2"><sup><i><b>c</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="http://ai.googleblog.com/2018/11/open-sourcing-bert-state-of-art-pre.html">"Open Sourcing BERT: State-of-the-Art Pre-training for Natural Language Processing"</a>. <i>Google AI Blog</i>. 2 November 2018. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20210113211449/https://ai.googleblog.com/2018/11/open-sourcing-bert-state-of-art-pre.html">Archived</a> from the original on 2021-01-13<span class="reference-accessdate">. Retrieved <span class="nowrap">2019-08-25</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Google+AI+Blog&rft.atitle=Open+Sourcing+BERT%3A+State-of-the-Art+Pre-training+for+Natural+Language+Processing&rft.date=2018-11-02&rft_id=http%3A%2F%2Fai.googleblog.com%2F2018%2F11%2Fopen-sourcing-bert-state-of-art-pre.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-14"><span class="mw-cite-backlink"><b><a href="#cite_ref-14">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFFeldmanBallard1982" class="citation journal cs1">Feldman, J. A.; Ballard, D. H. (1982-07-01). <a rel="nofollow" class="external text" href="https://www.sciencedirect.com/science/article/pii/S0364021382800013">"Connectionist models and their properties"</a>. <i>Cognitive Science</i>. 
<b>6</b> (3): <span class="nowrap">205–</span>254. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1016%2FS0364-0213%2882%2980001-3">10.1016/S0364-0213(82)80001-3</a>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/0364-0213">0364-0213</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Cognitive+Science&rft.atitle=Connectionist+models+and+their+properties&rft.volume=6&rft.issue=3&rft.pages=%3Cspan+class%3D%22nowrap%22%3E205-%3C%2Fspan%3E254&rft.date=1982-07-01&rft_id=info%3Adoi%2F10.1016%2FS0364-0213%2882%2980001-3&rft.issn=0364-0213&rft.aulast=Feldman&rft.aufirst=J.+A.&rft.au=Ballard%2C+D.+H.&rft_id=https%3A%2F%2Fwww.sciencedirect.com%2Fscience%2Farticle%2Fpii%2FS0364021382800013&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-PDP-15"><span class="mw-cite-backlink"><b><a href="#cite_ref-PDP_15-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRumelhartMcClellandHinton1987" class="citation book cs1">Rumelhart, David E.; McClelland, James L.; Hinton, Geoffrey E. (1987-07-29). <a rel="nofollow" class="external text" href="https://stanford.edu/~jlmcc/papers/PDP/Chapter2.pdf"><i>Parallel Distributed Processing, Volume 1: Explorations in the Microstructure of Cognition: Foundations, Chapter 2</i></a> <span class="cs1-format">(PDF)</span>. Cambridge, Mass: Bradford Books. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-0-262-68053-0" title="Special:BookSources/978-0-262-68053-0"><bdi>978-0-262-68053-0</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Parallel+Distributed+Processing%2C+Volume+1%3A+Explorations+in+the+Microstructure+of+Cognition%3A+Foundations%2C+Chapter+2&rft.place=Cambridge%2C+Mass&rft.pub=Bradford+Books&rft.date=1987-07-29&rft.isbn=978-0-262-68053-0&rft.aulast=Rumelhart&rft.aufirst=David+E.&rft.au=McClelland%2C+James+L.&rft.au=Hinton%2C+Geoffrey+E.&rft_id=https%3A%2F%2Fstanford.edu%2F~jlmcc%2Fpapers%2FPDP%2FChapter2.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-16"><span class="mw-cite-backlink"><b><a href="#cite_ref-16">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFGilesMaxwell1987" class="citation journal cs1">Giles, C. Lee; Maxwell, Tom (1987-12-01). <a rel="nofollow" class="external text" href="https://opg.optica.org/abstract.cfm?URI=ao-26-23-4972">"Learning, invariance, and generalization in high-order neural networks"</a>. <i>Applied Optics</i>. <b>26</b> (23): <span class="nowrap">4972–</span>4978. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1364%2FAO.26.004972">10.1364/AO.26.004972</a>. 
<a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/0003-6935">0003-6935</a>. <a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a> <a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/20523475">20523475</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Applied+Optics&rft.atitle=Learning%2C+invariance%2C+and+generalization+in+high-order+neural+networks&rft.volume=26&rft.issue=23&rft.pages=%3Cspan+class%3D%22nowrap%22%3E4972-%3C%2Fspan%3E4978&rft.date=1987-12-01&rft.issn=0003-6935&rft_id=info%3Apmid%2F20523475&rft_id=info%3Adoi%2F10.1364%2FAO.26.004972&rft.aulast=Giles&rft.aufirst=C.+Lee&rft.au=Maxwell%2C+Tom&rft_id=https%3A%2F%2Fopg.optica.org%2Fabstract.cfm%3FURI%3Dao-26-23-4972&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-transform19922-18"><span class="mw-cite-backlink">^ <a href="#cite_ref-transform19922_18-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-transform19922_18-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSchmidhuber1992" class="citation journal cs1"><a href="/wiki/J%C3%BCrgen_Schmidhuber" title="Jürgen Schmidhuber">Schmidhuber, Jürgen</a> (1992). <a rel="nofollow" class="external text" href="https://archive.org/download/wikipedia-scholarly-sources-corpus/10.1162.zip/10.1162%252Fneco.1992.4.1.131.pdf">"Learning to control fast-weight memories: an alternative to recurrent nets"</a> <span class="cs1-format">(PDF)</span>. <i>Neural Computation</i>. <b>4</b> (1): <span class="nowrap">131–</span>139. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1162%2Fneco.1992.4.1.131">10.1162/neco.1992.4.1.131</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:16683347">16683347</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Neural+Computation&rft.atitle=Learning+to+control+fast-weight+memories%3A+an+alternative+to+recurrent+nets.&rft.volume=4&rft.issue=1&rft.pages=%3Cspan+class%3D%22nowrap%22%3E131-%3C%2Fspan%3E139&rft.date=1992&rft_id=info%3Adoi%2F10.1162%2Fneco.1992.4.1.131&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A16683347%23id-name%3DS2CID&rft.aulast=Schmidhuber&rft.aufirst=J%C3%BCrgen&rft_id=https%3A%2F%2Farchive.org%2Fdownload%2Fwikipedia-scholarly-sources-corpus%2F10.1162.zip%2F10.1162%25252Fneco.1992.4.1.131.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-malsburg1981-19"><span class="mw-cite-backlink"><b><a href="#cite_ref-malsburg1981_19-0">^</a></b></span> <span class="reference-text">Christoph von der Malsburg: The correlation theory of brain function. Internal Report 81-2, MPI Biophysical Chemistry, 1981. 
<a rel="nofollow" class="external free" href="http://cogprints.org/1380/1/vdM_correlation.pdf">http://cogprints.org/1380/1/vdM_correlation.pdf</a> See Reprint in Models of Neural Networks II, chapter 2, pages 95-119. Springer, Berlin, 1994.</span> </li> <li id="cite_note-feldman1982-20"><span class="mw-cite-backlink"><b><a href="#cite_ref-feldman1982_20-0">^</a></b></span> <span class="reference-text">Jerome A. Feldman, "Dynamic connections in neural networks," Biological Cybernetics, vol. 46, no. 1, pp. 27-39, Dec. 1982.</span> </li> <li id="cite_note-21"><span class="mw-cite-backlink"><b><a href="#cite_ref-21">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFHintonPlaut1987" class="citation journal cs1">Hinton, Geoffrey E.; Plaut, David C. (1987). <a rel="nofollow" class="external text" href="https://escholarship.org/uc/item/0570j1dp">"Using Fast Weights to Deblur Old Memories"</a>. <i>Proceedings of the Annual Meeting of the Cognitive Science Society</i>. <b>9</b>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Proceedings+of+the+Annual+Meeting+of+the+Cognitive+Science+Society&rft.atitle=Using+Fast+Weights+to+Deblur+Old+Memories&rft.volume=9&rft.date=1987&rft.aulast=Hinton&rft.aufirst=Geoffrey+E.&rft.au=Plaut%2C+David+C.&rft_id=https%3A%2F%2Fescholarship.org%2Fuc%2Fitem%2F0570j1dp&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-fastlinear20202-22"><span class="mw-cite-backlink"><b><a href="#cite_ref-fastlinear20202_22-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKatharopoulosVyasPappasFleuret2020" class="citation conference cs1">Katharopoulos, Angelos; Vyas, Apoorv; Pappas, Nikolaos; Fleuret, François (2020). <a rel="nofollow" class="external text" href="https://proceedings.mlr.press/v119/katharopoulos20a.html">"Transformers are RNNs: Fast autoregressive Transformers with linear attention"</a>. <i>ICML 2020</i>. PMLR. pp. <span class="nowrap">5156–</span>5165.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.atitle=Transformers+are+RNNs%3A+Fast+autoregressive+Transformers+with+linear+attention&rft.btitle=ICML+2020&rft.pages=%3Cspan+class%3D%22nowrap%22%3E5156-%3C%2Fspan%3E5165&rft.pub=PMLR&rft.date=2020&rft.aulast=Katharopoulos&rft.aufirst=Angelos&rft.au=Vyas%2C+Apoorv&rft.au=Pappas%2C+Nikolaos&rft.au=Fleuret%2C+Fran%C3%A7ois&rft_id=https%3A%2F%2Fproceedings.mlr.press%2Fv119%2Fkatharopoulos20a.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-schlag20212-23"><span class="mw-cite-backlink"><b><a href="#cite_ref-schlag20212_23-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSchlagIrieSchmidhuber2021" class="citation conference cs1">Schlag, Imanol; Irie, Kazuki; <a href="/wiki/Juergen_Schmidhuber" class="mw-redirect" title="Juergen Schmidhuber">Schmidhuber, Jürgen</a> (2021). "Linear Transformers Are Secretly Fast Weight Programmers". <i>ICML 2021</i>. Springer. pp. 
<span class="nowrap">9355–</span>9366.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.atitle=Linear+Transformers+Are+Secretly+Fast+Weight+Programmers&rft.btitle=ICML+2021&rft.pages=%3Cspan+class%3D%22nowrap%22%3E9355-%3C%2Fspan%3E9366&rft.pub=Springer&rft.date=2021&rft.aulast=Schlag&rft.aufirst=Imanol&rft.au=Irie%2C+Kazuki&rft.au=Schmidhuber%2C+J%C3%BCrgen&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-:22-24"><span class="mw-cite-backlink">^ <a href="#cite_ref-:22_24-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:22_24-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-:22_24-2"><sup><i><b>c</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFChovan_MerriënboerGulcehreBahdanau2014" class="citation book cs1">Cho, Kyunghyun; van Merriënboer, Bart; Gulcehre, Caglar; Bahdanau, Dzmitry; Bougares, Fethi; Schwenk, Holger; Bengio, Yoshua (October 2014). <a rel="nofollow" class="external text" href="https://aclanthology.org/D14-1179">"Learning Phrase Representations using RNN Encoder–Decoder for Statistical Machine Translation"</a>. In Moschitti, Alessandro; Pang, Bo; Daelemans, Walter (eds.). <i>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)</i>. Doha, Qatar: Association for Computational Linguistics. pp. <span class="nowrap">1724–</span>1734. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1406.1078">1406.1078</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.3115%2Fv1%2FD14-1179">10.3115/v1/D14-1179</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Learning+Phrase+Representations+using+RNN+Encoder%E2%80%93Decoder+for+Statistical+Machine+Translation&rft.btitle=Proceedings+of+the+2014+Conference+on+Empirical+Methods+in+Natural+Language+Processing+%28EMNLP%29&rft.place=Doha%2C+Qatar&rft.pages=%3Cspan+class%3D%22nowrap%22%3E1724-%3C%2Fspan%3E1734&rft.pub=Association+for+Computational+Linguistics&rft.date=2014-10&rft_id=info%3Aarxiv%2F1406.1078&rft_id=info%3Adoi%2F10.3115%2Fv1%2FD14-1179&rft.aulast=Cho&rft.aufirst=Kyunghyun&rft.au=van+Merri%C3%ABnboer%2C+Bart&rft.au=Gulcehre%2C+Caglar&rft.au=Bahdanau%2C+Dzmitry&rft.au=Bougares%2C+Fethi&rft.au=Schwenk%2C+Holger&rft.au=Bengio%2C+Yoshua&rft_id=https%3A%2F%2Faclanthology.org%2FD14-1179&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-sequence-25"><span class="mw-cite-backlink">^ <a href="#cite_ref-sequence_25-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-sequence_25-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-sequence_25-2"><sup><i><b>c</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSutskeverVinyalsLe2014" class="citation arxiv cs1">Sutskever, Ilya; Vinyals, Oriol; Le, Quoc Viet (14 Dec 2014). "Sequence to sequence learning with neural networks". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1409.3215">1409.3215</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Sequence+to+sequence+learning+with+neural+networks&rft.date=2014-12-14&rft_id=info%3Aarxiv%2F1409.3215&rft.aulast=Sutskever&rft.aufirst=Ilya&rft.au=Vinyals%2C+Oriol&rft.au=Le%2C+Quoc+Viet&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span> [first version posted to arXiv on 10 Sep 2014]</span> </li> <li id="cite_note-MyUser_Arxiv.org_May_18_2016c-26"><span class="mw-cite-backlink"><b><a href="#cite_ref-MyUser_Arxiv.org_May_18_2016c_26-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFChungGulcehreChoBengio2014" class="citation arxiv cs1">Chung, Junyoung; Gulcehre, Caglar; Cho, KyungHyun; Bengio, Yoshua (2014). "Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1412.3555">1412.3555</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.NE">cs.NE</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Empirical+Evaluation+of+Gated+Recurrent+Neural+Networks+on+Sequence+Modeling&rft.date=2014&rft_id=info%3Aarxiv%2F1412.3555&rft.aulast=Chung&rft.aufirst=Junyoung&rft.au=Gulcehre%2C+Caglar&rft.au=Cho%2C+KyungHyun&rft.au=Bengio%2C+Yoshua&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-gruber_jockisch-27"><span class="mw-cite-backlink"><b><a href="#cite_ref-gruber_jockisch_27-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFGruberJockisch2020" class="citation cs2">Gruber, N.; Jockisch, A. 
(2020), "Are GRU cells more specific and LSTM cells more sensitive in motive classification of text?", <i>Frontiers in Artificial Intelligence</i>, <b>3</b>: 40, <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.3389%2Ffrai.2020.00040">10.3389/frai.2020.00040</a></span>, <a href="/wiki/PMC_(identifier)" class="mw-redirect" title="PMC (identifier)">PMC</a> <span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7861254">7861254</a></span>, <a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a> <a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/33733157">33733157</a>, <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:220252321">220252321</a></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Frontiers+in+Artificial+Intelligence&rft.atitle=Are+GRU+cells+more+specific+and+LSTM+cells+more+sensitive+in+motive+classification+of+text%3F&rft.volume=3&rft.pages=40&rft.date=2020&rft_id=https%3A%2F%2Fwww.ncbi.nlm.nih.gov%2Fpmc%2Farticles%2FPMC7861254%23id-name%3DPMC&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A220252321%23id-name%3DS2CID&rft_id=info%3Apmid%2F33733157&rft_id=info%3Adoi%2F10.3389%2Ffrai.2020.00040&rft.aulast=Gruber&rft.aufirst=N.&rft.au=Jockisch%2C+A.&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-28"><span class="mw-cite-backlink"><b><a href="#cite_ref-28">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSutskeverVinyalsLe2014" class="citation journal cs1">Sutskever, Ilya; Vinyals, Oriol; Le, Quoc V (2014). <a rel="nofollow" class="external text" href="https://proceedings.neurips.cc/paper/2014/hash/a14ac55a4f27472c5d894ec1c3c743d2-Abstract.html">"Sequence to Sequence Learning with Neural Networks"</a>. <i>Advances in Neural Information Processing Systems</i>. <b>27</b>. Curran Associates, Inc. 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1409.3215">1409.3215</a></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Advances+in+Neural+Information+Processing+Systems&rft.atitle=Sequence+to+Sequence+Learning+with+Neural+Networks&rft.volume=27&rft.date=2014&rft_id=info%3Aarxiv%2F1409.3215&rft.aulast=Sutskever&rft.aufirst=Ilya&rft.au=Vinyals%2C+Oriol&rft.au=Le%2C+Quoc+V&rft_id=https%3A%2F%2Fproceedings.neurips.cc%2Fpaper%2F2014%2Fhash%2Fa14ac55a4f27472c5d894ec1c3c743d2-Abstract.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-29"><span class="mw-cite-backlink"><b><a href="#cite_ref-29">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLuongPhamManning2015" class="citation arxiv cs1">Luong, Minh-Thang; Pham, Hieu; Manning, Christopher D. (2015). "Effective Approaches to Attention-based Neural Machine Translation". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1508.04025">1508.04025</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Effective+Approaches+to+Attention-based+Neural+Machine+Translation&rft.date=2015&rft_id=info%3Aarxiv%2F1508.04025&rft.aulast=Luong&rft.aufirst=Minh-Thang&rft.au=Pham%2C+Hieu&rft.au=Manning%2C+Christopher+D.&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-Y4moj-30"><span class="mw-cite-backlink"><b><a href="#cite_ref-Y4moj_30-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWuSchusterChenLe2016" class="citation arxiv cs1">Wu, Yonghui; et al. (2016-09-01). "Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1609.08144">1609.08144</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Google%27s+Neural+Machine+Translation+System%3A+Bridging+the+Gap+between+Human+and+Machine+Translation&rft.date=2016-09-01&rft_id=info%3Aarxiv%2F1609.08144&rft.aulast=Wu&rft.aufirst=Yonghui&rft.au=Schuster%2C+Mike&rft.au=Chen%2C+Zhifeng&rft.au=Le%2C+Quoc+V.&rft.au=Norouzi%2C+Mohammad&rft.au=Macherey%2C+Wolfgang&rft.au=Krikun%2C+Maxim&rft.au=Cao%2C+Yuan&rft.au=Gao%2C+Qin&rft.au=Macherey%2C+Klaus&rft.au=Klingner%2C+Jeff&rft.au=Shah%2C+Apurva&rft.au=Johnson%2C+Melvin&rft.au=Liu%2C+Xiaobing&rft.au=Kaiser%2C+%C5%81ukasz&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-UJDu8-31"><span class="mw-cite-backlink"><b><a href="#cite_ref-UJDu8_31-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLewis-Kraus2016" class="citation news cs1">Lewis-Kraus, Gideon (2016-12-14). <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230524052626/https://www.nytimes.com/2016/12/14/magazine/the-great-ai-awakening.html">"The Great A.I. Awakening"</a>. <i>The New York Times</i>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/0362-4331">0362-4331</a>. Archived from <a rel="nofollow" class="external text" href="https://www.nytimes.com/2016/12/14/magazine/the-great-ai-awakening.html">the original</a> on 24 May 2023<span class="reference-accessdate">. Retrieved <span class="nowrap">2023-06-22</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=The+New+York+Times&rft.atitle=The+Great+A.I.+Awakening&rft.date=2016-12-14&rft.issn=0362-4331&rft.aulast=Lewis-Kraus&rft.aufirst=Gideon&rft_id=https%3A%2F%2Fwww.nytimes.com%2F2016%2F12%2F14%2Fmagazine%2Fthe-great-ai-awakening.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-32"><span class="mw-cite-backlink"><b><a href="#cite_ref-32">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFParikhTäckströmDasUszkoreit2016" class="citation arxiv cs1">Parikh, Ankur P.; Täckström, Oscar; Das, Dipanjan; Uszkoreit, Jakob (2016-09-25). "A Decomposable Attention Model for Natural Language Inference". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1606.01933">1606.01933</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=A+Decomposable+Attention+Model+for+Natural+Language+Inference&rft.date=2016-09-25&rft_id=info%3Aarxiv%2F1606.01933&rft.aulast=Parikh&rft.aufirst=Ankur+P.&rft.au=T%C3%A4ckstr%C3%B6m%2C+Oscar&rft.au=Das%2C+Dipanjan&rft.au=Uszkoreit%2C+Jakob&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-:11-33"><span class="mw-cite-backlink">^ <a href="#cite_ref-:11_33-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:11_33-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLevy" class="citation magazine cs1">Levy, Steven. <a rel="nofollow" class="external text" href="https://www.wired.com/story/eight-google-employees-invented-modern-ai-transformers-paper/">"8 Google Employees Invented Modern AI. Here's the Inside Story"</a>. <i>Wired</i>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/1059-1028">1059-1028</a>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240320101528/https://www.wired.com/story/eight-google-employees-invented-modern-ai-transformers-paper/">Archived</a> from the original on 20 Mar 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">2024-08-06</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Wired&rft.atitle=8+Google+Employees+Invented+Modern+AI.+Here%27s+the+Inside+Story&rft.issn=1059-1028&rft.aulast=Levy&rft.aufirst=Steven&rft_id=https%3A%2F%2Fwww.wired.com%2Fstory%2Feight-google-employees-invented-modern-ai-transformers-paper%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-34"><span class="mw-cite-backlink"><b><a href="#cite_ref-34">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFChengDongLapata2016" class="citation book cs1">Cheng, Jianpeng; Dong, Li; Lapata, Mirella (November 2016). <a rel="nofollow" class="external text" href="https://aclanthology.org/D16-1053/">"Long Short-Term Memory-Networks for Machine Reading"</a>. In Su, Jian; Duh, Kevin; Carreras, Xavier (eds.). <i>Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing</i>. Austin, Texas: Association for Computational Linguistics. pp. <span class="nowrap">551–</span>561. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.18653%2Fv1%2FD16-1053">10.18653/v1/D16-1053</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Long+Short-Term+Memory-Networks+for+Machine+Reading&rft.btitle=Proceedings+of+the+2016+Conference+on+Empirical+Methods+in+Natural+Language+Processing&rft.place=Austin%2C+Texas&rft.pages=%3Cspan+class%3D%22nowrap%22%3E551-%3C%2Fspan%3E561&rft.pub=Association+for+Computational+Linguistics&rft.date=2016-11&rft_id=info%3Adoi%2F10.18653%2Fv1%2FD16-1053&rft.aulast=Cheng&rft.aufirst=Jianpeng&rft.au=Dong%2C+Li&rft.au=Lapata%2C+Mirella&rft_id=https%3A%2F%2Faclanthology.org%2FD16-1053%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-35"><span class="mw-cite-backlink"><b><a href="#cite_ref-35">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPengAlcaideAnthonyAlbalak2023" class="citation cs2">Peng, Bo; Alcaide, Eric; Anthony, Quentin; Albalak, Alon; Arcadinho, Samuel; Biderman, Stella; Cao, Huanqi; Cheng, Xin; Chung, Michael (2023-12-10), <i>RWKV: Reinventing RNNs for the Transformer Era</i>, <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2305.13048">2305.13048</a></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=RWKV%3A+Reinventing+RNNs+for+the+Transformer+Era&rft.date=2023-12-10&rft_id=info%3Aarxiv%2F2305.13048&rft.aulast=Peng&rft.aufirst=Bo&rft.au=Alcaide%2C+Eric&rft.au=Anthony%2C+Quentin&rft.au=Albalak%2C+Alon&rft.au=Arcadinho%2C+Samuel&rft.au=Biderman%2C+Stella&rft.au=Cao%2C+Huanqi&rft.au=Cheng%2C+Xin&rft.au=Chung%2C+Michael&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-36"><span class="mw-cite-backlink"><b><a href="#cite_ref-36">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFMarche2024" class="citation magazine cs1">Marche, Stephen (2024-08-23). <a rel="nofollow" class="external text" href="https://www.newyorker.com/science/annals-of-artificial-intelligence/was-linguistic-ai-created-by-accident">"Was Linguistic A.I. Created by Accident?"</a>. <i>The New Yorker</i>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/0028-792X">0028-792X</a><span class="reference-accessdate">. 
Retrieved <span class="nowrap">2024-08-27</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=The+New+Yorker&rft.atitle=Was+Linguistic+A.I.+Created+by+Accident%3F&rft.date=2024-08-23&rft.issn=0028-792X&rft.aulast=Marche&rft.aufirst=Stephen&rft_id=https%3A%2F%2Fwww.newyorker.com%2Fscience%2Fannals-of-artificial-intelligence%2Fwas-linguistic-ai-created-by-accident&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-:03-37"><span class="mw-cite-backlink">^ <a href="#cite_ref-:03_37-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:03_37-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-:03_37-2"><sup><i><b>c</b></i></sup></a> <a href="#cite_ref-:03_37-3"><sup><i><b>d</b></i></sup></a> <a href="#cite_ref-:03_37-4"><sup><i><b>e</b></i></sup></a> <a href="#cite_ref-:03_37-5"><sup><i><b>f</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDevlinChangLeeToutanova2018" class="citation arxiv cs1">Devlin, Jacob; Chang, Ming-Wei; Lee, Kenton; Toutanova, Kristina (11 October 2018). "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1810.04805v2">1810.04805v2</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=BERT%3A+Pre-training+of+Deep+Bidirectional+Transformers+for+Language+Understanding&rft.date=2018-10-11&rft_id=info%3Aarxiv%2F1810.04805v2&rft.aulast=Devlin&rft.aufirst=Jacob&rft.au=Chang%2C+Ming-Wei&rft.au=Lee%2C+Kenton&rft.au=Toutanova%2C+Kristina&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-38"><span class="mw-cite-backlink"><b><a href="#cite_ref-38">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://searchengineland.com/google-bert-used-on-almost-every-english-query-342193">"Google: BERT now used on almost every English query"</a>. <i>Search Engine Land</i>. 2020-10-15<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2020-11-24</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Search+Engine+Land&rft.atitle=Google%3A+BERT+now+used+on+almost+every+English+query&rft.date=2020-10-15&rft_id=https%3A%2F%2Fsearchengineland.com%2Fgoogle-bert-used-on-almost-every-english-query-342193&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-39"><span class="mw-cite-backlink"><b><a href="#cite_ref-39">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://research.google/blog/recent-advances-in-google-translate/">"Recent Advances in Google Translate"</a>. <i>research.google</i><span class="reference-accessdate">. Retrieved <span class="nowrap">2024-05-08</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=research.google&rft.atitle=Recent+Advances+in+Google+Translate&rft_id=http%3A%2F%2Fresearch.google%2Fblog%2Frecent-advances-in-google-translate%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-40"><span class="mw-cite-backlink"><b><a href="#cite_ref-40">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://www.technologyreview.com/2023/03/03/1069311/inside-story-oral-history-how-chatgpt-built-openai/">"The inside story of how ChatGPT was built from the people who made it"</a>. <i>MIT Technology Review</i><span class="reference-accessdate">. Retrieved <span class="nowrap">2024-08-06</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=MIT+Technology+Review&rft.atitle=The+inside+story+of+how+ChatGPT+was+built+from+the+people+who+made+it&rft_id=https%3A%2F%2Fwww.technologyreview.com%2F2023%2F03%2F03%2F1069311%2Finside-story-oral-history-how-chatgpt-built-openai%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-gpt12-41"><span class="mw-cite-backlink"><b><a href="#cite_ref-gpt12_41-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://openai.com/research/language-unsupervised">"Improving language understanding with unsupervised learning"</a>. <i>openai.com</i>. June 11, 2018. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20230318210736/https://openai.com/research/language-unsupervised">Archived</a> from the original on 2023-03-18<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-03-18</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=openai.com&rft.atitle=Improving+language+understanding+with+unsupervised+learning&rft.date=2018-06-11&rft_id=https%3A%2F%2Fopenai.com%2Fresearch%2Flanguage-unsupervised&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-ngEG3-42"><span class="mw-cite-backlink"><b><a href="#cite_ref-ngEG3_42-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation cs2"><a rel="nofollow" class="external text" href="https://github.com/openai/finetune-transformer-lm"><i>finetune-transformer-lm</i></a>, OpenAI, June 11, 2018<span class="reference-accessdate">, retrieved <span class="nowrap">2023-05-01</span></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=finetune-transformer-lm&rft.pub=OpenAI&rft.date=2018-06-11&rft_id=https%3A%2F%2Fgithub.com%2Fopenai%2Ffinetune-transformer-lm&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-auto2-43"><span class="mw-cite-backlink">^ <a href="#cite_ref-auto2_43-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-auto2_43-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDosovitskiyBeyerKolesnikovWeissenborn2021" class="citation arxiv cs1">Dosovitskiy, Alexey; Beyer, Lucas; Kolesnikov, Alexander; Weissenborn, Dirk; Zhai, Xiaohua; Unterthiner, Thomas; Dehghani, Mostafa; Minderer, Matthias; Heigold, Georg; Gelly, Sylvain; Uszkoreit, Jakob (2021-06-03). "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2010.11929">2010.11929</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CV">cs.CV</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=An+Image+is+Worth+16x16+Words%3A+Transformers+for+Image+Recognition+at+Scale&rft.date=2021-06-03&rft_id=info%3Aarxiv%2F2010.11929&rft.aulast=Dosovitskiy&rft.aufirst=Alexey&rft.au=Beyer%2C+Lucas&rft.au=Kolesnikov%2C+Alexander&rft.au=Weissenborn%2C+Dirk&rft.au=Zhai%2C+Xiaohua&rft.au=Unterthiner%2C+Thomas&rft.au=Dehghani%2C+Mostafa&rft.au=Minderer%2C+Matthias&rft.au=Heigold%2C+Georg&rft.au=Gelly%2C+Sylvain&rft.au=Uszkoreit%2C+Jakob&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-Gulati2020-44"><span class="mw-cite-backlink">^ <a href="#cite_ref-Gulati2020_44-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Gulati2020_44-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFGulatiQinChiuParmar2020" class="citation arxiv cs1">Gulati, Anmol; Qin, James; Chiu, Chung-Cheng; Parmar, Niki; Zhang, Yu; Yu, Jiahui; Han, Wei; Wang, Shibo; Zhang, Zhengdong; Wu, Yonghui; Pang, Ruoming (2020). "Conformer: Convolution-augmented Transformer for Speech Recognition". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2005.08100">2005.08100</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/eess.AS">eess.AS</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Conformer%3A+Convolution-augmented+Transformer+for+Speech+Recognition&rft.date=2020&rft_id=info%3Aarxiv%2F2005.08100&rft.aulast=Gulati&rft.aufirst=Anmol&rft.au=Qin%2C+James&rft.au=Chiu%2C+Chung-Cheng&rft.au=Parmar%2C+Niki&rft.au=Zhang%2C+Yu&rft.au=Yu%2C+Jiahui&rft.au=Han%2C+Wei&rft.au=Wang%2C+Shibo&rft.au=Zhang%2C+Zhengdong&rft.au=Wu%2C+Yonghui&rft.au=Pang%2C+Ruoming&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-choromanski2020-45"><span class="mw-cite-backlink"><b><a href="#cite_ref-choromanski2020_45-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFChoromanskiLikhosherstovDohanSong2022" class="citation cs2">Choromanski, Krzysztof; Likhosherstov, Valerii; Dohan, David; Song, Xingyou; Gane, Andreea; Sarlos, Tamas; Hawkins, Peter; Davis, Jared; Mohiuddin, Afroz (2022-11-19), <i>Rethinking Attention with Performers</i>, <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2009.14794">2009.14794</a></span></cite><span 
title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Rethinking+Attention+with+Performers&rft.date=2022-11-19&rft_id=info%3Aarxiv%2F2009.14794&rft.aulast=Choromanski&rft.aufirst=Krzysztof&rft.au=Likhosherstov%2C+Valerii&rft.au=Dohan%2C+David&rft.au=Song%2C+Xingyou&rft.au=Gane%2C+Andreea&rft.au=Sarlos%2C+Tamas&rft.au=Hawkins%2C+Peter&rft.au=Davis%2C+Jared&rft.au=Mohiuddin%2C+Afroz&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-46"><span class="mw-cite-backlink"><b><a href="#cite_ref-46">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLiuMaoWuFeichtenhofer2022" class="citation conference cs1">Liu, Zhuang; Mao, Hanzi; Wu, Chao-Yuan; Feichtenhofer, Christoph; Darrell, Trevor; Xie, Saining (2022). <a rel="nofollow" class="external text" href="https://openaccess.thecvf.com/content/CVPR2022/html/Liu_A_ConvNet_for_the_2020s_CVPR_2022_paper.html"><i>A ConvNet for the 2020s</i></a>. Conference on Computer Vision and Pattern Recognition. pp. <span class="nowrap">11976–</span>11986.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.btitle=A+ConvNet+for+the+2020s&rft.pages=%3Cspan+class%3D%22nowrap%22%3E11976-%3C%2Fspan%3E11986&rft.date=2022&rft.aulast=Liu&rft.aufirst=Zhuang&rft.au=Mao%2C+Hanzi&rft.au=Wu%2C+Chao-Yuan&rft.au=Feichtenhofer%2C+Christoph&rft.au=Darrell%2C+Trevor&rft.au=Xie%2C+Saining&rft_id=https%3A%2F%2Fopenaccess.thecvf.com%2Fcontent%2FCVPR2022%2Fhtml%2FLiu_A_ConvNet_for_the_2020s_CVPR_2022_paper.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-:62-47"><span class="mw-cite-backlink"><b><a href="#cite_ref-:62_47-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFEsserKulalBlattmannEntezari2024" class="citation cs2">Esser, Patrick; Kulal, Sumith; Blattmann, Andreas; Entezari, Rahim; Müller, Jonas; Saini, Harry; Levi, Yam; Lorenz, Dominik; Sauer, Axel (2024-03-05), <i>Scaling Rectified Flow Transformers for High-Resolution Image Synthesis</i>, <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2403.03206">2403.03206</a></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Scaling+Rectified+Flow+Transformers+for+High-Resolution+Image+Synthesis&rft.date=2024-03-05&rft_id=info%3Aarxiv%2F2403.03206&rft.aulast=Esser&rft.aufirst=Patrick&rft.au=Kulal%2C+Sumith&rft.au=Blattmann%2C+Andreas&rft.au=Entezari%2C+Rahim&rft.au=M%C3%BCller%2C+Jonas&rft.au=Saini%2C+Harry&rft.au=Levi%2C+Yam&rft.au=Lorenz%2C+Dominik&rft.au=Sauer%2C+Axel&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-auto1-48"><span class="mw-cite-backlink">^ <a href="#cite_ref-auto1_48-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-auto1_48-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFXiongYangHeZheng2020" 
class="citation arxiv cs1">Xiong, Ruibin; Yang, Yunchang; He, Di; Zheng, Kai; Zheng, Shuxin; Xing, Chen; Zhang, Huishuai; Lan, Yanyan; Wang, Liwei; Liu, Tie-Yan (2020-06-29). "On Layer Normalization in the Transformer Architecture". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2002.04745">2002.04745</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=On+Layer+Normalization+in+the+Transformer+Architecture&rft.date=2020-06-29&rft_id=info%3Aarxiv%2F2002.04745&rft.aulast=Xiong&rft.aufirst=Ruibin&rft.au=Yang%2C+Yunchang&rft.au=He%2C+Di&rft.au=Zheng%2C+Kai&rft.au=Zheng%2C+Shuxin&rft.au=Xing%2C+Chen&rft.au=Zhang%2C+Huishuai&rft.au=Lan%2C+Yanyan&rft.au=Wang%2C+Liwei&rft.au=Liu%2C+Tie-Yan&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-:0-49"><span class="mw-cite-backlink"><b><a href="#cite_ref-:0_49-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRaffelShazeerRobertsLee2020" class="citation journal cs1">Raffel, Colin; Shazeer, Noam; Roberts, Adam; Lee, Katherine; Narang, Sharan; Matena, Michael; Zhou, Yanqi; Li, Wei; Liu, Peter J. (2020-01-01). <a rel="nofollow" class="external text" href="https://dl.acm.org/doi/abs/10.5555/3455716.3455856">"Exploring the limits of transfer learning with a unified text-to-text transformer"</a>. <i>The Journal of Machine Learning Research</i>. <b>21</b> (1): 140:5485–140:5551. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1910.10683">1910.10683</a></span>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/1532-4435">1532-4435</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=The+Journal+of+Machine+Learning+Research&rft.atitle=Exploring+the+limits+of+transfer+learning+with+a+unified+text-to-text+transformer&rft.volume=21&rft.issue=1&rft.pages=140%3A5485-140%3A5551&rft.date=2020-01-01&rft_id=info%3Aarxiv%2F1910.10683&rft.issn=1532-4435&rft.aulast=Raffel&rft.aufirst=Colin&rft.au=Shazeer%2C+Noam&rft.au=Roberts%2C+Adam&rft.au=Lee%2C+Katherine&rft.au=Narang%2C+Sharan&rft.au=Matena%2C+Michael&rft.au=Zhou%2C+Yanqi&rft.au=Li%2C+Wei&rft.au=Liu%2C+Peter+J.&rft_id=https%3A%2F%2Fdl.acm.org%2Fdoi%2Fabs%2F10.5555%2F3455716.3455856&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-50"><span class="mw-cite-backlink"><b><a href="#cite_ref-50">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRaffelShazeerRobertsLee2019" class="citation arxiv cs1">Raffel, Colin; Shazeer, Noam; Roberts, Adam; Lee, Katherine; Narang, Sharan; Matena, Michael; Zhou, Yanqi; Li, Wei; Liu, Peter J. (2019). 
"Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1910.10683">1910.10683</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Exploring+the+Limits+of+Transfer+Learning+with+a+Unified+Text-to-Text+Transformer&rft.date=2019&rft_id=info%3Aarxiv%2F1910.10683&rft.aulast=Raffel&rft.aufirst=Colin&rft.au=Shazeer%2C+Noam&rft.au=Roberts%2C+Adam&rft.au=Lee%2C+Katherine&rft.au=Narang%2C+Sharan&rft.au=Matena%2C+Michael&rft.au=Zhou%2C+Yanqi&rft.au=Li%2C+Wei&rft.au=Liu%2C+Peter+J.&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-:5-51"><span class="mw-cite-backlink">^ <a href="#cite_ref-:5_51-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:5_51-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://huggingface.co/docs/transformers/tasks/masked_language_modeling">"Masked language modeling"</a>. <i>huggingface.co</i><span class="reference-accessdate">. Retrieved <span class="nowrap">2023-10-05</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=huggingface.co&rft.atitle=Masked+language+modeling&rft_id=https%3A%2F%2Fhuggingface.co%2Fdocs%2Ftransformers%2Ftasks%2Fmasked_language_modeling&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-:8-52"><span class="mw-cite-backlink">^ <a href="#cite_ref-:8_52-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:8_52-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://huggingface.co/docs/transformers/tasks/language_modeling">"Causal language modeling"</a>. <i>huggingface.co</i><span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-10-05</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=huggingface.co&rft.atitle=Causal+language+modeling&rft_id=https%3A%2F%2Fhuggingface.co%2Fdocs%2Ftransformers%2Ftasks%2Flanguage_modeling&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-:4-53"><span class="mw-cite-backlink">^ <a href="#cite_ref-:4_53-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:4_53-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-:4_53-2"><sup><i><b>c</b></i></sup></a> <a href="#cite_ref-:4_53-3"><sup><i><b>d</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFTayDehghaniTranGarcia2023" class="citation cs2">Tay, Yi; Dehghani, Mostafa; Tran, Vinh Q.; Garcia, Xavier; Wei, Jason; Wang, Xuezhi; Chung, Hyung Won; Shakeri, Siamak; Bahri, Dara (2023-02-28), <i>UL2: Unifying Language Learning Paradigms</i>, <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2205.05131">2205.05131</a></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=UL2%3A+Unifying+Language+Learning+Paradigms&rft.date=2023-02-28&rft_id=info%3Aarxiv%2F2205.05131&rft.aulast=Tay&rft.aufirst=Yi&rft.au=Dehghani%2C+Mostafa&rft.au=Tran%2C+Vinh+Q.&rft.au=Garcia%2C+Xavier&rft.au=Wei%2C+Jason&rft.au=Wang%2C+Xuezhi&rft.au=Chung%2C+Hyung+Won&rft.au=Shakeri%2C+Siamak&rft.au=Bahri%2C+Dara&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-54"><span class="mw-cite-backlink"><b><a href="#cite_ref-54">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPressWolf2017" class="citation cs2">Press, Ofir; Wolf, Lior (2017-02-21), <i>Using the Output Embedding to Improve Language Models</i>, <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1608.05859">1608.05859</a></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Using+the+Output+Embedding+to+Improve+Language+Models&rft.date=2017-02-21&rft_id=info%3Aarxiv%2F1608.05859&rft.aulast=Press&rft.aufirst=Ofir&rft.au=Wolf%2C+Lior&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-55"><span class="mw-cite-backlink"><b><a href="#cite_ref-55">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLintz2016" class="citation web cs1">Lintz, Nathan (2016-04-18). <a rel="nofollow" class="external text" href="https://indico.io/blog/sequence-modeling-neural-networks-part2-attention-models/">"Sequence Modeling with Neural Networks (Part 2): Attention Models"</a>. <i>Indico</i>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20201021203352/https://indico.io/blog/sequence-modeling-neural-networks-part2-attention-models/">Archived</a> from the original on 2020-10-21<span class="reference-accessdate">. Retrieved <span class="nowrap">2019-10-15</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Indico&rft.atitle=Sequence+Modeling+with+Neural+Networks+%28Part+2%29%3A+Attention+Models&rft.date=2016-04-18&rft.aulast=Lintz&rft.aufirst=Nathan&rft_id=https%3A%2F%2Findico.io%2Fblog%2Fsequence-modeling-neural-networks-part2-attention-models%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-:1-56"><span class="mw-cite-backlink">^ <a href="#cite_ref-:1_56-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:1_56-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-:1_56-2"><sup><i><b>c</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFAlammar" class="citation web cs1">Alammar, Jay. <a rel="nofollow" class="external text" href="http://jalammar.github.io/illustrated-transformer/">"The Illustrated Transformer"</a>. <i>jalammar.github.io</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20201018061610/https://jalammar.github.io/illustrated-transformer/">Archived</a> from the original on 2020-10-18<span class="reference-accessdate">. Retrieved <span class="nowrap">2019-10-15</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=jalammar.github.io&rft.atitle=The+Illustrated+Transformer&rft.aulast=Alammar&rft.aufirst=Jay&rft_id=http%3A%2F%2Fjalammar.github.io%2Fillustrated-transformer%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-57"><span class="mw-cite-backlink"><b><a href="#cite_ref-57">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFTeam" class="citation web cs1">Team, Keras. <a rel="nofollow" class="external text" href="https://keras.io/api/keras_nlp/models/gpt2/gpt2_backbone/">"Keras documentation: GPT2Backbone model"</a>. <i>keras.io</i><span class="reference-accessdate">. Retrieved <span class="nowrap">2024-08-08</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=keras.io&rft.atitle=Keras+documentation%3A+GPT2Backbone+model&rft.aulast=Team&rft.aufirst=Keras&rft_id=https%3A%2F%2Fkeras.io%2Fapi%2Fkeras_nlp%2Fmodels%2Fgpt2%2Fgpt2_backbone%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-58"><span class="mw-cite-backlink"><b><a href="#cite_ref-58">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFClarkKhandelwalLevyManning2019" class="citation journal cs1">Clark, Kevin; Khandelwal, Urvashi; Levy, Omer; Manning, Christopher D. (August 2019). <a rel="nofollow" class="external text" href="https://www.aclweb.org/anthology/W19-4828">"What Does BERT Look at? An Analysis of BERT's Attention"</a>. 
<i>Proceedings of the 2019 ACL Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP</i>. Florence, Italy: Association for Computational Linguistics: <span class="nowrap">276–</span>286. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1906.04341">1906.04341</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.18653%2Fv1%2FW19-4828">10.18653/v1/W19-4828</a></span>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20201021211357/https://www.aclweb.org/anthology/W19-4828/">Archived</a> from the original on 2020-10-21<span class="reference-accessdate">. Retrieved <span class="nowrap">2020-05-20</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Proceedings+of+the+2019+ACL+Workshop+BlackboxNLP%3A+Analyzing+and+Interpreting+Neural+Networks+for+NLP&rft.atitle=What+Does+BERT+Look+at%3F+An+Analysis+of+BERT%27s+Attention&rft.pages=%3Cspan+class%3D%22nowrap%22%3E276-%3C%2Fspan%3E286&rft.date=2019-08&rft_id=info%3Aarxiv%2F1906.04341&rft_id=info%3Adoi%2F10.18653%2Fv1%2FW19-4828&rft.aulast=Clark&rft.aufirst=Kevin&rft.au=Khandelwal%2C+Urvashi&rft.au=Levy%2C+Omer&rft.au=Manning%2C+Christopher+D.&rft_id=https%3A%2F%2Fwww.aclweb.org%2Fanthology%2FW19-4828&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-59"><span class="mw-cite-backlink"><b><a href="#cite_ref-59">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFYangDaiYangCarbonell2019" class="citation journal cs1">Yang, Zhilin; Dai, Zihang; Yang, Yiming; Carbonell, Jaime; Salakhutdinov, Russ R; Le, Quoc V (2019). <a rel="nofollow" class="external text" href="https://proceedings.neurips.cc/paper/2019/hash/dc6a7e655d7e5840e66733e9ee67cc69-Abstract.html">"XLNet: Generalized Autoregressive Pretraining for Language Understanding"</a>. <i>Advances in Neural Information Processing Systems</i>. <b>32</b>. Curran Associates, Inc. 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1906.08237">1906.08237</a></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Advances+in+Neural+Information+Processing+Systems&rft.atitle=XLNet%3A+Generalized+Autoregressive+Pretraining+for+Language+Understanding&rft.volume=32&rft.date=2019&rft_id=info%3Aarxiv%2F1906.08237&rft.aulast=Yang&rft.aufirst=Zhilin&rft.au=Dai%2C+Zihang&rft.au=Yang%2C+Yiming&rft.au=Carbonell%2C+Jaime&rft.au=Salakhutdinov%2C+Russ+R&rft.au=Le%2C+Quoc+V&rft_id=https%3A%2F%2Fproceedings.neurips.cc%2Fpaper%2F2019%2Fhash%2Fdc6a7e655d7e5840e66733e9ee67cc69-Abstract.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-gpt1paper-60"><span class="mw-cite-backlink"><b><a href="#cite_ref-gpt1paper_60-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRadfordNarasimhanSalimansSutskever2018" class="citation web cs1">Radford, Alec; Narasimhan, Karthik; Salimans, Tim; Sutskever, Ilya (11 June 2018). <a rel="nofollow" class="external text" href="https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf">"Improving Language Understanding by Generative Pre-Training"</a> <span class="cs1-format">(PDF)</span>. <a href="/wiki/OpenAI" title="OpenAI">OpenAI</a>. p. 12. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20210126024542/https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 26 January 2021<span class="reference-accessdate">. Retrieved <span class="nowrap">23 January</span> 2021</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Improving+Language+Understanding+by+Generative+Pre-Training&rft.pages=12&rft.pub=OpenAI&rft.date=2018-06-11&rft.aulast=Radford&rft.aufirst=Alec&rft.au=Narasimhan%2C+Karthik&rft.au=Salimans%2C+Tim&rft.au=Sutskever%2C+Ilya&rft_id=https%3A%2F%2Fcdn.openai.com%2Fresearch-covers%2Flanguage-unsupervised%2Flanguage_understanding_paper.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-61"><span class="mw-cite-backlink"><b><a href="#cite_ref-61">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWangLiXiaoZhu2019" class="citation cs2">Wang, Qiang; Li, Bei; Xiao, Tong; Zhu, Jingbo; Li, Changliang; Wong, Derek F.; Chao, Lidia S. 
(2019-06-04), <i>Learning Deep Transformer Models for Machine Translation</i>, <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1906.01787">1906.01787</a></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Learning+Deep+Transformer+Models+for+Machine+Translation&rft.date=2019-06-04&rft_id=info%3Aarxiv%2F1906.01787&rft.aulast=Wang&rft.aufirst=Qiang&rft.au=Li%2C+Bei&rft.au=Xiao%2C+Tong&rft.au=Zhu%2C+Jingbo&rft.au=Li%2C+Changliang&rft.au=Wong%2C+Derek+F.&rft.au=Chao%2C+Lidia+S.&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-62"><span class="mw-cite-backlink"><b><a href="#cite_ref-62">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPhuongHutter2022" class="citation cs2">Phuong, Mary; Hutter, Marcus (2022-07-19), <i>Formal Algorithms for Transformers</i>, <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2207.09238">2207.09238</a></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Formal+Algorithms+for+Transformers&rft.date=2022-07-19&rft_id=info%3Aarxiv%2F2207.09238&rft.aulast=Phuong&rft.aufirst=Mary&rft.au=Hutter%2C+Marcus&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-:3-63"><span class="mw-cite-backlink">^ <a href="#cite_ref-:3_63-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:3_63-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-:3_63-2"><sup><i><b>c</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRaffelShazeerRobertsLee2020" class="citation journal cs1">Raffel, Colin; Shazeer, Noam; Roberts, Adam; Lee, Katherine; Narang, Sharan; Matena, Michael; Zhou, Yanqi; Li, Wei; Liu, Peter J. (2020). <a rel="nofollow" class="external text" href="http://jmlr.org/papers/v21/20-074.html">"Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer"</a>. <i>Journal of Machine Learning Research</i>. <b>21</b> (140): <span class="nowrap">1–</span>67. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1910.10683">1910.10683</a></span>. 
<a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/1533-7928">1533-7928</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Journal+of+Machine+Learning+Research&rft.atitle=Exploring+the+Limits+of+Transfer+Learning+with+a+Unified+Text-to-Text+Transformer&rft.volume=21&rft.issue=140&rft.pages=%3Cspan+class%3D%22nowrap%22%3E1-%3C%2Fspan%3E67&rft.date=2020&rft_id=info%3Aarxiv%2F1910.10683&rft.issn=1533-7928&rft.aulast=Raffel&rft.aufirst=Colin&rft.au=Shazeer%2C+Noam&rft.au=Roberts%2C+Adam&rft.au=Lee%2C+Katherine&rft.au=Narang%2C+Sharan&rft.au=Matena%2C+Michael&rft.au=Zhou%2C+Yanqi&rft.au=Li%2C+Wei&rft.au=Liu%2C+Peter+J.&rft_id=http%3A%2F%2Fjmlr.org%2Fpapers%2Fv21%2F20-074.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-64"><span class="mw-cite-backlink"><b><a href="#cite_ref-64">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://research.google/blog/recent-advances-in-google-translate/">"Recent Advances in Google Translate"</a>. <i>Google Research</i>. June 8, 2020. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240704042433/https://research.google/blog/recent-advances-in-google-translate/">Archived</a> from the original on 4 Jul 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">2024-08-07</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Google+Research&rft.atitle=Recent+Advances+in+Google+Translate&rft.date=2020-06-08&rft_id=http%3A%2F%2Fresearch.google%2Fblog%2Frecent-advances-in-google-translate%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-65"><span class="mw-cite-backlink"><b><a href="#cite_ref-65">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFShazeer2020" class="citation arxiv cs1">Shazeer, Noam (2020-02-01). "GLU Variants Improve Transformer". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2002.05202">2002.05202</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=GLU+Variants+Improve+Transformer&rft.date=2020-02-01&rft_id=info%3Aarxiv%2F2002.05202&rft.aulast=Shazeer&rft.aufirst=Noam&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-66"><span class="mw-cite-backlink"><b><a href="#cite_ref-66">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFHendrycksGimpel2016" class="citation arxiv cs1">Hendrycks, Dan; Gimpel, Kevin (2016-06-27). "Gaussian Error Linear Units (GELUs)". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1606.08415v5">1606.08415v5</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Gaussian+Error+Linear+Units+%28GELUs%29&rft.date=2016-06-27&rft_id=info%3Aarxiv%2F1606.08415v5&rft.aulast=Hendrycks&rft.aufirst=Dan&rft.au=Gimpel%2C+Kevin&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-67"><span class="mw-cite-backlink"><b><a href="#cite_ref-67">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFShazeer2020" class="citation arxiv cs1">Shazeer, Noam (February 14, 2020). "GLU Variants Improve Transformer". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2002.05202">2002.05202</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=GLU+Variants+Improve+Transformer&rft.date=2020-02-14&rft_id=info%3Aarxiv%2F2002.05202&rft.aulast=Shazeer&rft.aufirst=Noam&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-68"><span class="mw-cite-backlink"><b><a href="#cite_ref-68">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFZhangSennrich2019" class="citation journal cs1">Zhang, Biao; Sennrich, Rico (2019). <a rel="nofollow" class="external text" href="https://proceedings.neurips.cc/paper/2019/hash/1e8a19426224ca89e83cef47f1e7f53b-Abstract.html">"Root Mean Square Layer Normalization"</a>. <i>Advances in Neural Information Processing Systems</i>. <b>32</b>. Curran Associates, Inc. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1910.07467">1910.07467</a></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Advances+in+Neural+Information+Processing+Systems&rft.atitle=Root+Mean+Square+Layer+Normalization&rft.volume=32&rft.date=2019&rft_id=info%3Aarxiv%2F1910.07467&rft.aulast=Zhang&rft.aufirst=Biao&rft.au=Sennrich%2C+Rico&rft_id=https%3A%2F%2Fproceedings.neurips.cc%2Fpaper%2F2019%2Fhash%2F1e8a19426224ca89e83cef47f1e7f53b-Abstract.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-69"><span class="mw-cite-backlink"><b><a href="#cite_ref-69">^</a></b></span> <span class="reference-text">Tembine, Hamidou, Manzoor Ahmed Khan, and Issa Bamia. 2024. "Mean-Field-Type Transformers" Mathematics 12, no. 22: 3506. 
<a rel="nofollow" class="external free" href="https://doi.org/10.3390/math12223506">https://doi.org/10.3390/math12223506</a></span> </li> <li id="cite_note-:9-70"><span class="mw-cite-backlink">^ <a href="#cite_ref-:9_70-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:9_70-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFNguyenSalazar2019" class="citation journal cs1">Nguyen, Toan Q.; Salazar, Julian (2019-11-02). Niehues, Jan; Cattoni, Rolando; Stüker, Sebastian; Negri, Matteo; Turchi, Marco; Ha, Thanh-Le; Salesky, Elizabeth; Sanabria, Ramon; Barrault, Loic (eds.). <a rel="nofollow" class="external text" href="https://aclanthology.org/2019.iwslt-1.17">"Transformers without Tears: Improving the Normalization of Self-Attention"</a>. <i>Proceedings of the 16th International Conference on Spoken Language Translation</i>. Hong Kong: Association for Computational Linguistics. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1910.05895">1910.05895</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.5281%2Fzenodo.3525484">10.5281/zenodo.3525484</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Proceedings+of+the+16th+International+Conference+on+Spoken+Language+Translation&rft.atitle=Transformers+without+Tears%3A+Improving+the+Normalization+of+Self-Attention&rft.date=2019-11-02&rft_id=info%3Aarxiv%2F1910.05895&rft_id=info%3Adoi%2F10.5281%2Fzenodo.3525484&rft.aulast=Nguyen&rft.aufirst=Toan+Q.&rft.au=Salazar%2C+Julian&rft_id=https%3A%2F%2Faclanthology.org%2F2019.iwslt-1.17&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-71"><span class="mw-cite-backlink"><b><a href="#cite_ref-71">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDufterSchmittSchütze2022" class="citation journal cs1">Dufter, Philipp; Schmitt, Martin; Schütze, Hinrich (2022-06-06). <a rel="nofollow" class="external text" href="https://doi.org/10.1162%2Fcoli_a_00445">"Position Information in Transformers: An Overview"</a>. <i>Computational Linguistics</i>. <b>48</b> (3): <span class="nowrap">733–</span>763. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2102.11090">2102.11090</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.1162%2Fcoli_a_00445">10.1162/coli_a_00445</a></span>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/0891-2017">0891-2017</a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:231986066">231986066</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Computational+Linguistics&rft.atitle=Position+Information+in+Transformers%3A+An+Overview&rft.volume=48&rft.issue=3&rft.pages=%3Cspan+class%3D%22nowrap%22%3E733-%3C%2Fspan%3E763&rft.date=2022-06-06&rft_id=info%3Aarxiv%2F2102.11090&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A231986066%23id-name%3DS2CID&rft.issn=0891-2017&rft_id=info%3Adoi%2F10.1162%2Fcoli_a_00445&rft.aulast=Dufter&rft.aufirst=Philipp&rft.au=Schmitt%2C+Martin&rft.au=Sch%C3%BCtze%2C+Hinrich&rft_id=https%3A%2F%2Fdoi.org%2F10.1162%252Fcoli_a_00445&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-72"><span class="mw-cite-backlink"><b><a href="#cite_ref-72">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFGehringAuliGrangierYarats2017" class="citation journal cs1">Gehring, Jonas; Auli, Michael; Grangier, David; Yarats, Denis; Dauphin, Yann N. (2017-07-17). <a rel="nofollow" class="external text" href="https://proceedings.mlr.press/v70/gehring17a.html">"Convolutional Sequence to Sequence Learning"</a>. <i>Proceedings of the 34th International Conference on Machine Learning</i>. PMLR: <span class="nowrap">1243–</span>1252.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Proceedings+of+the+34th+International+Conference+on+Machine+Learning&rft.atitle=Convolutional+Sequence+to+Sequence+Learning&rft.pages=%3Cspan+class%3D%22nowrap%22%3E1243-%3C%2Fspan%3E1252&rft.date=2017-07-17&rft.aulast=Gehring&rft.aufirst=Jonas&rft.au=Auli%2C+Michael&rft.au=Grangier%2C+David&rft.au=Yarats%2C+Denis&rft.au=Dauphin%2C+Yann+N.&rft_id=https%3A%2F%2Fproceedings.mlr.press%2Fv70%2Fgehring17a.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-73"><span class="mw-cite-backlink"><b><a href="#cite_ref-73">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFHavivRamPressIzsak2022" class="citation cs2">Haviv, Adi; Ram, Ori; Press, Ofir; Izsak, Peter; Levy, Omer (2022-12-05), <i>Transformer Language Models without Positional Encodings Still Learn Positional Information</i>, <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2203.16634">2203.16634</a></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Transformer+Language+Models+without+Positional+Encodings+Still+Learn+Positional+Information&rft.date=2022-12-05&rft_id=info%3Aarxiv%2F2203.16634&rft.aulast=Haviv&rft.aufirst=Adi&rft.au=Ram%2C+Ori&rft.au=Press%2C+Ofir&rft.au=Izsak%2C+Peter&rft.au=Levy%2C+Omer&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-74"><span class="mw-cite-backlink"><b><a href="#cite_ref-74">^</a></b></span> <span 
class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSuLuPanMurtadha2021" class="citation arxiv cs1">Su, Jianlin; Lu, Yu; Pan, Shengfeng; Murtadha, Ahmed; Wen, Bo; Liu, Yunfeng (2021-04-01). "RoFormer: Enhanced Transformer with Rotary Position Embedding". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2104.09864">2104.09864</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=RoFormer%3A+Enhanced+Transformer+with+Rotary+Position+Embedding&rft.date=2021-04-01&rft_id=info%3Aarxiv%2F2104.09864&rft.aulast=Su&rft.aufirst=Jianlin&rft.au=Lu%2C+Yu&rft.au=Pan%2C+Shengfeng&rft.au=Murtadha%2C+Ahmed&rft.au=Wen%2C+Bo&rft.au=Liu%2C+Yunfeng&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-75"><span class="mw-cite-backlink"><b><a href="#cite_ref-75">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPressSmithLewis2021" class="citation arxiv cs1">Press, Ofir; Smith, Noah A.; Lewis, Mike (2021-08-01). "Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2108.12409">2108.12409</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Train+Short%2C+Test+Long%3A+Attention+with+Linear+Biases+Enables+Input+Length+Extrapolation&rft.date=2021-08-01&rft_id=info%3Aarxiv%2F2108.12409&rft.aulast=Press&rft.aufirst=Ofir&rft.au=Smith%2C+Noah+A.&rft.au=Lewis%2C+Mike&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-76"><span class="mw-cite-backlink"><b><a href="#cite_ref-76">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFShawUszkoreitVaswani2018" class="citation arxiv cs1">Shaw, Peter; Uszkoreit, Jakob; Vaswani, Ashish (2018). "Self-Attention with Relative Position Representations". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1803.02155">1803.02155</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Self-Attention+with+Relative+Position+Representations&rft.date=2018&rft_id=info%3Aarxiv%2F1803.02155&rft.aulast=Shaw&rft.aufirst=Peter&rft.au=Uszkoreit%2C+Jakob&rft.au=Vaswani%2C+Ashish&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-77"><span class="mw-cite-backlink"><b><a href="#cite_ref-77">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKeHeLiu2021" class="citation cs2">Ke, Guolin; He, Di; Liu, Tie-Yan (2021-03-15), <i>Rethinking Positional Encoding in Language Pre-training</i>, <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2006.15595">2006.15595</a></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Rethinking+Positional+Encoding+in+Language+Pre-training&rft.date=2021-03-15&rft_id=info%3Aarxiv%2F2006.15595&rft.aulast=Ke&rft.aufirst=Guolin&rft.au=He%2C+Di&rft.au=Liu%2C+Tie-Yan&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-78"><span class="mw-cite-backlink"><b><a href="#cite_ref-78">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKwonLiZhuangSheng2023" class="citation book cs1">Kwon, Woosuk; Li, Zhuohan; Zhuang, Siyuan; Sheng, Ying; Zheng, Lianmin; Yu, Cody Hao; Gonzalez, Joseph; Zhang, Hao; Stoica, Ion (2023-10-23). <a rel="nofollow" class="external text" href="https://dl.acm.org/doi/10.1145/3600006.3613165">"Efficient Memory Management for Large Language Model Serving with PagedAttention"</a>. <i>Proceedings of the 29th Symposium on Operating Systems Principles</i>. SOSP '23. New York, NY, USA: Association for Computing Machinery. pp. <span class="nowrap">611–</span>626. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2309.06180">2309.06180</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1145%2F3600006.3613165">10.1145/3600006.3613165</a>. 
<a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/979-8-4007-0229-7" title="Special:BookSources/979-8-4007-0229-7"><bdi>979-8-4007-0229-7</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Efficient+Memory+Management+for+Large+Language+Model+Serving+with+PagedAttention&rft.btitle=Proceedings+of+the+29th+Symposium+on+Operating+Systems+Principles&rft.place=New+York%2C+NY%2C+USA&rft.series=SOSP+%2723&rft.pages=%3Cspan+class%3D%22nowrap%22%3E611-%3C%2Fspan%3E626&rft.pub=Association+for+Computing+Machinery&rft.date=2023-10-23&rft_id=info%3Aarxiv%2F2309.06180&rft_id=info%3Adoi%2F10.1145%2F3600006.3613165&rft.isbn=979-8-4007-0229-7&rft.aulast=Kwon&rft.aufirst=Woosuk&rft.au=Li%2C+Zhuohan&rft.au=Zhuang%2C+Siyuan&rft.au=Sheng%2C+Ying&rft.au=Zheng%2C+Lianmin&rft.au=Yu%2C+Cody+Hao&rft.au=Gonzalez%2C+Joseph&rft.au=Zhang%2C+Hao&rft.au=Stoica%2C+Ion&rft_id=https%3A%2F%2Fdl.acm.org%2Fdoi%2F10.1145%2F3600006.3613165&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-79"><span class="mw-cite-backlink"><b><a href="#cite_ref-79">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation cs2"><a rel="nofollow" class="external text" href="https://github.com/vllm-project/vllm"><i>vllm-project/vllm</i></a>, vLLM, 2024-06-20<span class="reference-accessdate">, retrieved <span class="nowrap">2024-06-20</span></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=vllm-project%2Fvllm&rft.pub=vLLM&rft.date=2024-06-20&rft_id=https%3A%2F%2Fgithub.com%2Fvllm-project%2Fvllm&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-80"><span class="mw-cite-backlink"><b><a href="#cite_ref-80">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFContribution)2023" class="citation web cs1">Contribution), Woosuk Kwon*, Zhuohan Li*, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Yu, Joey Gonzalez, Hao Zhang, and Ion Stoica (* Equal (2023-06-20). <a rel="nofollow" class="external text" href="https://blog.vllm.ai/2023/06/20/vllm.html">"vLLM: Easy, Fast, and Cheap LLM Serving with PagedAttention"</a>. <i>vLLM Blog</i><span class="reference-accessdate">. 
Retrieved <span class="nowrap">2024-06-20</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=vLLM+Blog&rft.atitle=vLLM%3A+Easy%2C+Fast%2C+and+Cheap+LLM+Serving+with+PagedAttention&rft.date=2023-06-20&rft.aulast=Contribution%29&rft.aufirst=Woosuk+Kwon%2A%2C+Zhuohan+Li%2A%2C+Siyuan+Zhuang%2C+Ying+Sheng%2C+Lianmin+Zheng%2C+Cody+Yu%2C+Joey+Gonzalez%2C+Hao+Zhang%2C+and+Ion+Stoica+%28%2A+Equal&rft_id=https%3A%2F%2Fblog.vllm.ai%2F2023%2F06%2F20%2Fvllm.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span><span class="cs1-maint citation-comment"><code class="cs1-code">{{<a href="/wiki/Template:Cite_web" title="Template:Cite web">cite web</a>}}</code>: CS1 maint: multiple names: authors list (<a href="/wiki/Category:CS1_maint:_multiple_names:_authors_list" title="Category:CS1 maint: multiple names: authors list">link</a>)</span></span> </li> <li id="cite_note-81"><span class="mw-cite-backlink"><b><a href="#cite_ref-81">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDaoFuErmonRudra2022" class="citation journal cs1">Dao, Tri; Fu, Dan; Ermon, Stefano; Rudra, Atri; Ré, Christopher (2022-12-06). <a rel="nofollow" class="external text" href="https://proceedings.neurips.cc/paper_files/paper/2022/hash/67d57c32e20fd0a7a302cb81d36e40d5-Abstract-Conference.html">"FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness"</a>. <i>Advances in Neural Information Processing Systems</i>. <b>35</b>: <span class="nowrap">16344–</span>16359. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2205.14135">2205.14135</a></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Advances+in+Neural+Information+Processing+Systems&rft.atitle=FlashAttention%3A+Fast+and+Memory-Efficient+Exact+Attention+with+IO-Awareness&rft.volume=35&rft.pages=%3Cspan+class%3D%22nowrap%22%3E16344-%3C%2Fspan%3E16359&rft.date=2022-12-06&rft_id=info%3Aarxiv%2F2205.14135&rft.aulast=Dao&rft.aufirst=Tri&rft.au=Fu%2C+Dan&rft.au=Ermon%2C+Stefano&rft.au=Rudra%2C+Atri&rft.au=R%C3%A9%2C+Christopher&rft_id=https%3A%2F%2Fproceedings.neurips.cc%2Fpaper_files%2Fpaper%2F2022%2Fhash%2F67d57c32e20fd0a7a302cb81d36e40d5-Abstract-Conference.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> <li id="cite_note-82"><span class="mw-cite-backlink"><b><a href="#cite_ref-82">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://crfm.stanford.edu/2023/07/17/flash2.html">"Stanford CRFM"</a>. <i>crfm.stanford.edu</i><span class="reference-accessdate">. 
href="#cite_ref-116">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKariampuzhaAlyeaQuSanjak2023" class="citation journal cs1">Kariampuzha, William; Alyea, Gioconda; Qu, Sue; Sanjak, Jaleal; Mathé, Ewy; Sid, Eric; Chatelaine, Haley; Yadaw, Arjun; Xu, Yanji; Zhu, Qian (2023). <a rel="nofollow" class="external text" href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9972634">"Precision information extraction for rare disease epidemiology at scale"</a>. <i>Journal of Translational Medicine</i>. <b>21</b> (1): 157. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.1186%2Fs12967-023-04011-y">10.1186/s12967-023-04011-y</a></span>. <a href="/wiki/PMC_(identifier)" class="mw-redirect" title="PMC (identifier)">PMC</a> <span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9972634">9972634</a></span>. <a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a> <a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/36855134">36855134</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Journal+of+Translational+Medicine&rft.atitle=Precision+information+extraction+for+rare+disease+epidemiology+at+scale&rft.volume=21&rft.issue=1&rft.pages=157&rft.date=2023&rft_id=https%3A%2F%2Fwww.ncbi.nlm.nih.gov%2Fpmc%2Farticles%2FPMC9972634%23id-name%3DPMC&rft_id=info%3Apmid%2F36855134&rft_id=info%3Adoi%2F10.1186%2Fs12967-023-04011-y&rft.aulast=Kariampuzha&rft.aufirst=William&rft.au=Alyea%2C+Gioconda&rft.au=Qu%2C+Sue&rft.au=Sanjak%2C+Jaleal&rft.au=Math%C3%A9%2C+Ewy&rft.au=Sid%2C+Eric&rft.au=Chatelaine%2C+Haley&rft.au=Yadaw%2C+Arjun&rft.au=Xu%2C+Yanji&rft.au=Zhu%2C+Qian&rft_id=https%3A%2F%2Fwww.ncbi.nlm.nih.gov%2Fpmc%2Farticles%2FPMC9972634&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></span> </li> </ol></div></div> <div class="mw-heading mw-heading2"><h2 id="Further_reading">Further reading</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Transformer_(deep_learning_architecture)&action=edit&section=48" title="Edit section: Further reading"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <style data-mw-deduplicate="TemplateStyles:r1239549316">.mw-parser-output .refbegin{margin-bottom:0.5em}.mw-parser-output .refbegin-hanging-indents>ul{margin-left:0}.mw-parser-output .refbegin-hanging-indents>ul>li{margin-left:0;padding-left:3.2em;text-indent:-3.2em}.mw-parser-output .refbegin-hanging-indents ul,.mw-parser-output .refbegin-hanging-indents ul li{list-style:none}@media(max-width:720px){.mw-parser-output .refbegin-hanging-indents>ul>li{padding-left:1.6em;text-indent:-1.6em}}.mw-parser-output .refbegin-columns{margin-top:0.3em}.mw-parser-output .refbegin-columns ul{margin-top:0}.mw-parser-output .refbegin-columns li{page-break-inside:avoid;break-inside:avoid-column}@media screen{.mw-parser-output .refbegin{font-size:90%}}</style><div class="refbegin" style=""> <ul><li>Alexander Rush, <a rel="nofollow" class="external text" href="https://nlp.seas.harvard.edu/2018/04/03/attention.html">The Annotated transformer</a> <a 
rel="nofollow" class="external text" href="https://web.archive.org/web/20210922093841/https://nlp.seas.harvard.edu/2018/04/03/attention.html">Archived</a> 2021-09-22 at the <a href="/wiki/Wayback_Machine" title="Wayback Machine">Wayback Machine</a>, Harvard NLP group, 3 April 2018</li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPhuongHutter2022" class="citation arxiv cs1">Phuong, Mary; Hutter, Marcus (2022). "Formal Algorithms for Transformers". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2207.09238">2207.09238</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Formal+Algorithms+for+Transformers&rft.date=2022&rft_id=info%3Aarxiv%2F2207.09238&rft.aulast=Phuong&rft.aufirst=Mary&rft.au=Hutter%2C+Marcus&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFFerrandoSartiBisazzaCosta-jussà2024" class="citation arxiv cs1">Ferrando, Javier; Sarti, Gabriele; Bisazza, Arianna; Costa-jussà, Marta R. (2024-05-01). "A Primer on the Inner Workings of Transformer-based Language Models". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2405.00208">2405.00208</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=A+Primer+on+the+Inner+Workings+of+Transformer-based+Language+Models&rft.date=2024-05-01&rft_id=info%3Aarxiv%2F2405.00208&rft.aulast=Ferrando&rft.aufirst=Javier&rft.au=Sarti%2C+Gabriele&rft.au=Bisazza%2C+Arianna&rft.au=Costa-juss%C3%A0%2C+Marta+R.&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATransformer+%28deep+learning+architecture%29" class="Z3988"></span></li></ul> </div> <div class="navbox-styles"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374"><style data-mw-deduplicate="TemplateStyles:r1236075235">.mw-parser-output .navbox{box-sizing:border-box;border:1px solid #a2a9b1;width:100%;clear:both;font-size:88%;text-align:center;padding:1px;margin:1em auto 0}.mw-parser-output .navbox .navbox{margin-top:0}.mw-parser-output .navbox+.navbox,.mw-parser-output .navbox+.navbox-styles+.navbox{margin-top:-1px}.mw-parser-output .navbox-inner,.mw-parser-output .navbox-subgroup{width:100%}.mw-parser-output .navbox-group,.mw-parser-output .navbox-title,.mw-parser-output .navbox-abovebelow{padding:0.25em 1em;line-height:1.5em;text-align:center}.mw-parser-output .navbox-group{white-space:nowrap;text-align:right}.mw-parser-output .navbox,.mw-parser-output .navbox-subgroup{background-color:#fdfdfd}.mw-parser-output .navbox-list{line-height:1.5em;border-color:#fdfdfd}.mw-parser-output .navbox-list-with-group{text-align:left;border-left-width:2px;border-left-style:solid}.mw-parser-output tr+tr>.navbox-abovebelow,.mw-parser-output tr+tr>.navbox-group,.mw-parser-output 
method</a></li></ul></li> <li><a href="/wiki/Backpropagation" title="Backpropagation">Backpropagation</a></li> <li><a href="/wiki/Attention_(machine_learning)" title="Attention (machine learning)">Attention</a></li> <li><a href="/wiki/Convolution" title="Convolution">Convolution</a></li> <li><a href="/wiki/Normalization_(machine_learning)" title="Normalization (machine learning)">Normalization</a> <ul><li><a href="/wiki/Batch_normalization" title="Batch normalization">Batchnorm</a></li></ul></li> <li><a href="/wiki/Activation_function" title="Activation function">Activation</a> <ul><li><a href="/wiki/Softmax_function" title="Softmax function">Softmax</a></li> <li><a href="/wiki/Sigmoid_function" title="Sigmoid function">Sigmoid</a></li> <li><a href="/wiki/Rectifier_(neural_networks)" title="Rectifier (neural networks)">Rectifier</a></li></ul></li> <li><a href="/wiki/Gating_mechanism" title="Gating mechanism">Gating</a></li> <li><a href="/wiki/Weight_initialization" title="Weight initialization">Weight initialization</a></li> <li><a href="/wiki/Regularization_(mathematics)" title="Regularization (mathematics)">Regularization</a></li> <li><a href="/wiki/Training,_validation,_and_test_data_sets" title="Training, validation, and test data sets">Datasets</a> <ul><li><a href="/wiki/Data_augmentation" title="Data augmentation">Augmentation</a></li></ul></li> <li><a href="/wiki/Prompt_engineering" title="Prompt engineering">Prompt engineering</a></li> <li><a href="/wiki/Reinforcement_learning" title="Reinforcement learning">Reinforcement learning</a> <ul><li><a href="/wiki/Q-learning" title="Q-learning">Q-learning</a></li> <li><a href="/wiki/State%E2%80%93action%E2%80%93reward%E2%80%93state%E2%80%93action" title="State–action–reward–state–action">SARSA</a></li> <li><a href="/wiki/Imitation_learning" title="Imitation learning">Imitation</a></li> <li><a href="/wiki/Policy_gradient_method" title="Policy gradient method">Policy gradient</a></li></ul></li> <li><a href="/wiki/Diffusion_process" title="Diffusion process">Diffusion</a></li> <li><a href="/wiki/Latent_diffusion_model" title="Latent diffusion model">Latent diffusion model</a></li> <li><a href="/wiki/Autoregressive_model" title="Autoregressive model">Autoregression</a></li> <li><a href="/wiki/Adversarial_machine_learning" title="Adversarial machine learning">Adversary</a></li> <li><a href="/wiki/Retrieval-augmented_generation" title="Retrieval-augmented generation">RAG</a></li> <li><a href="/wiki/Uncanny_valley" title="Uncanny valley">Uncanny valley</a></li> <li><a href="/wiki/Reinforcement_learning_from_human_feedback" title="Reinforcement learning from human feedback">RLHF</a></li> <li><a href="/wiki/Self-supervised_learning" title="Self-supervised learning">Self-supervised learning</a></li> <li><a href="/wiki/Recursive_self-improvement" title="Recursive self-improvement">Recursive self-improvement</a></li> <li><a href="/wiki/Word_embedding" title="Word embedding">Word embedding</a></li> <li><a href="/wiki/Hallucination_(artificial_intelligence)" title="Hallucination (artificial intelligence)">Hallucination</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Applications</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Machine_learning" title="Machine learning">Machine learning</a> <ul><li><a href="/wiki/Prompt_engineering#In-context_learning" title="Prompt engineering">In-context learning</a></li></ul></li> 
<li><a href="/wiki/Neural_network_(machine_learning)" title="Neural network (machine learning)">Artificial neural network</a> <ul><li><a href="/wiki/Deep_learning" title="Deep learning">Deep learning</a></li></ul></li> <li><a href="/wiki/Language_model" title="Language model">Language model</a> <ul><li><a href="/wiki/Large_language_model" title="Large language model">Large language model</a></li> <li><a href="/wiki/Neural_machine_translation" title="Neural machine translation">NMT</a></li></ul></li> <li><a href="/wiki/Artificial_general_intelligence" title="Artificial general intelligence">Artificial general intelligence</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Implementations</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"></div><table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbody><tr><th scope="row" class="navbox-group" style="width:1%">Audio–visual</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/AlexNet" title="AlexNet">AlexNet</a></li> <li><a href="/wiki/WaveNet" title="WaveNet">WaveNet</a></li> <li><a href="/wiki/Human_image_synthesis" title="Human image synthesis">Human image synthesis</a></li> <li><a href="/wiki/Handwriting_recognition" title="Handwriting recognition">HWR</a></li> <li><a href="/wiki/Optical_character_recognition" title="Optical character recognition">OCR</a></li> <li><a href="/wiki/Deep_learning_speech_synthesis" title="Deep learning speech synthesis">Speech synthesis</a> <ul><li><a href="/wiki/15.ai" title="15.ai">15.ai</a></li> <li><a href="/wiki/ElevenLabs" title="ElevenLabs">ElevenLabs</a></li></ul></li> <li><a href="/wiki/Speech_recognition" title="Speech recognition">Speech recognition</a> <ul><li><a href="/wiki/Whisper_(speech_recognition_system)" title="Whisper (speech recognition system)">Whisper</a></li></ul></li> <li><a href="/wiki/Facial_recognition_system" title="Facial recognition system">Facial recognition</a></li> <li><a href="/wiki/AlphaFold" title="AlphaFold">AlphaFold</a></li> <li><a href="/wiki/Text-to-image_model" title="Text-to-image model">Text-to-image models</a> <ul><li><a href="/wiki/Aurora_(text-to-image_model)" class="mw-redirect" title="Aurora (text-to-image model)">Aurora</a></li> <li><a href="/wiki/DALL-E" title="DALL-E">DALL-E</a></li> <li><a href="/wiki/Adobe_Firefly" title="Adobe Firefly">Firefly</a></li> <li><a href="/wiki/Flux_(text-to-image_model)" title="Flux (text-to-image model)">Flux</a></li> <li><a href="/wiki/Ideogram_(text-to-image_model)" title="Ideogram (text-to-image model)">Ideogram</a></li> <li><a href="/wiki/Google_Brain#Text-to-image_model" title="Google Brain">Imagen</a></li> <li><a href="/wiki/Midjourney" title="Midjourney">Midjourney</a></li> <li><a href="/wiki/Stable_Diffusion" title="Stable Diffusion">Stable Diffusion</a></li></ul></li> <li><a href="/wiki/Text-to-video_model" title="Text-to-video model">Text-to-video models</a> <ul><li><a href="/wiki/Dream_Machine_(text-to-video_model)" title="Dream Machine (text-to-video model)">Dream Machine</a></li> <li><a href="/wiki/Runway_(company)#Gen-3_Alpha" title="Runway (company)">Gen-3 Alpha</a></li> <li><a href="/wiki/MiniMax_(company)#Hailuo_AI" title="MiniMax (company)">Hailuo AI</a></li> <li><a href="/wiki/Kling_(text-to-video_model)" class="mw-redirect" title="Kling (text-to-video model)">Kling</a></li> <li><a 
href="/wiki/Sora_(text-to-video_model)" title="Sora (text-to-video model)">Sora</a></li> <li><a href="/wiki/Google_DeepMind#Video_model" title="Google DeepMind">Veo</a></li></ul></li> <li><a href="/wiki/Music_and_artificial_intelligence" title="Music and artificial intelligence">Music generation</a> <ul><li><a href="/wiki/Suno_AI" title="Suno AI">Suno AI</a></li> <li><a href="/wiki/Udio" title="Udio">Udio</a></li></ul></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Text</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Word2vec" title="Word2vec">Word2vec</a></li> <li><a href="/wiki/Seq2seq" title="Seq2seq">Seq2seq</a></li> <li><a href="/wiki/GloVe" title="GloVe">GloVe</a></li> <li><a href="/wiki/BERT_(language_model)" title="BERT (language model)">BERT</a></li> <li><a href="/wiki/T5_(language_model)" title="T5 (language model)">T5</a></li> <li><a href="/wiki/Llama_(language_model)" title="Llama (language model)">Llama</a></li> <li><a href="/wiki/Chinchilla_(language_model)" title="Chinchilla (language model)">Chinchilla AI</a></li> <li><a href="/wiki/PaLM" title="PaLM">PaLM</a></li> <li><a href="/wiki/Generative_pre-trained_transformer" title="Generative pre-trained transformer">GPT</a> <ul><li><a href="/wiki/GPT-1" title="GPT-1">1</a></li> <li><a href="/wiki/GPT-2" title="GPT-2">2</a></li> <li><a href="/wiki/GPT-3" title="GPT-3">3</a></li> <li><a href="/wiki/GPT-J" title="GPT-J">J</a></li> <li><a href="/wiki/ChatGPT" title="ChatGPT">ChatGPT</a></li> <li><a href="/wiki/GPT-4" title="GPT-4">4</a></li> <li><a href="/wiki/GPT-4o" title="GPT-4o">4o</a></li> <li><a href="/wiki/OpenAI_o1" title="OpenAI o1">o1</a></li> <li><a href="/wiki/OpenAI_o3" title="OpenAI o3">o3</a></li></ul></li> <li><a href="/wiki/Claude_(language_model)" title="Claude (language model)">Claude</a></li> <li><a href="/wiki/Gemini_(language_model)" title="Gemini (language model)">Gemini</a> <ul><li><a href="/wiki/Gemini_(chatbot)" title="Gemini (chatbot)">chatbot</a></li></ul></li> <li><a href="/wiki/Grok_(chatbot)" title="Grok (chatbot)">Grok</a></li> <li><a href="/wiki/LaMDA" title="LaMDA">LaMDA</a></li> <li><a href="/wiki/BLOOM_(language_model)" title="BLOOM (language model)">BLOOM</a></li> <li><a href="/wiki/Project_Debater" title="Project Debater">Project Debater</a></li> <li><a href="/wiki/IBM_Watson" title="IBM Watson">IBM Watson</a></li> <li><a href="/wiki/IBM_Watsonx" title="IBM Watsonx">IBM Watsonx</a></li> <li><a href="/wiki/IBM_Granite" title="IBM Granite">Granite</a></li> <li><a href="/wiki/Huawei_PanGu" title="Huawei PanGu">PanGu-Σ</a></li> <li><a href="/wiki/DeepSeek" title="DeepSeek">DeepSeek</a></li> <li><a href="/wiki/Qwen" title="Qwen">Qwen</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Decisional</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/AlphaGo" title="AlphaGo">AlphaGo</a></li> <li><a href="/wiki/AlphaZero" title="AlphaZero">AlphaZero</a></li> <li><a href="/wiki/OpenAI_Five" title="OpenAI Five">OpenAI Five</a></li> <li><a href="/wiki/Self-driving_car" title="Self-driving car">Self-driving car</a></li> <li><a href="/wiki/MuZero" title="MuZero">MuZero</a></li> <li><a href="/wiki/Action_selection" title="Action selection">Action selection</a> <ul><li><a href="/wiki/AutoGPT" title="AutoGPT">AutoGPT</a></li></ul></li> <li><a 
href="/wiki/Robot_control" title="Robot control">Robot control</a></li></ul> </div></td></tr></tbody></table><div></div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">People</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Alan_Turing" title="Alan Turing">Alan Turing</a></li> <li><a href="/wiki/Warren_Sturgis_McCulloch" title="Warren Sturgis McCulloch">Warren Sturgis McCulloch</a></li> <li><a href="/wiki/Walter_Pitts" title="Walter Pitts">Walter Pitts</a></li> <li><a href="/wiki/John_von_Neumann" title="John von Neumann">John von Neumann</a></li> <li><a href="/wiki/Claude_Shannon" title="Claude Shannon">Claude Shannon</a></li> <li><a href="/wiki/Marvin_Minsky" title="Marvin Minsky">Marvin Minsky</a></li> <li><a href="/wiki/John_McCarthy_(computer_scientist)" title="John McCarthy (computer scientist)">John McCarthy</a></li> <li><a href="/wiki/Nathaniel_Rochester_(computer_scientist)" title="Nathaniel Rochester (computer scientist)">Nathaniel Rochester</a></li> <li><a href="/wiki/Allen_Newell" title="Allen Newell">Allen Newell</a></li> <li><a href="/wiki/Cliff_Shaw" title="Cliff Shaw">Cliff Shaw</a></li> <li><a href="/wiki/Herbert_A._Simon" title="Herbert A. Simon">Herbert A. Simon</a></li> <li><a href="/wiki/Oliver_Selfridge" title="Oliver Selfridge">Oliver Selfridge</a></li> <li><a href="/wiki/Frank_Rosenblatt" title="Frank Rosenblatt">Frank Rosenblatt</a></li> <li><a href="/wiki/Bernard_Widrow" title="Bernard Widrow">Bernard Widrow</a></li> <li><a href="/wiki/Joseph_Weizenbaum" title="Joseph Weizenbaum">Joseph Weizenbaum</a></li> <li><a href="/wiki/Seymour_Papert" title="Seymour Papert">Seymour Papert</a></li> <li><a href="/wiki/Seppo_Linnainmaa" title="Seppo Linnainmaa">Seppo Linnainmaa</a></li> <li><a href="/wiki/Paul_Werbos" title="Paul Werbos">Paul Werbos</a></li> <li><a href="/wiki/J%C3%BCrgen_Schmidhuber" title="Jürgen Schmidhuber">Jürgen Schmidhuber</a></li> <li><a href="/wiki/Yann_LeCun" title="Yann LeCun">Yann LeCun</a></li> <li><a href="/wiki/Geoffrey_Hinton" title="Geoffrey Hinton">Geoffrey Hinton</a></li> <li><a href="/wiki/John_Hopfield" title="John Hopfield">John Hopfield</a></li> <li><a href="/wiki/Yoshua_Bengio" title="Yoshua Bengio">Yoshua Bengio</a></li> <li><a href="/wiki/Lotfi_A._Zadeh" title="Lotfi A. Zadeh">Lotfi A. 
Zadeh</a></li> <li><a href="/wiki/Stephen_Grossberg" title="Stephen Grossberg">Stephen Grossberg</a></li> <li><a href="/wiki/Alex_Graves_(computer_scientist)" title="Alex Graves (computer scientist)">Alex Graves</a></li> <li><a href="/wiki/Andrew_Ng" title="Andrew Ng">Andrew Ng</a></li> <li><a href="/wiki/Fei-Fei_Li" title="Fei-Fei Li">Fei-Fei Li</a></li> <li><a href="/wiki/Alex_Krizhevsky" title="Alex Krizhevsky">Alex Krizhevsky</a></li> <li><a href="/wiki/Ilya_Sutskever" title="Ilya Sutskever">Ilya Sutskever</a></li> <li><a href="/wiki/Demis_Hassabis" title="Demis Hassabis">Demis Hassabis</a></li> <li><a href="/wiki/David_Silver_(computer_scientist)" title="David Silver (computer scientist)">David Silver</a></li> <li><a href="/wiki/Ian_Goodfellow" title="Ian Goodfellow">Ian Goodfellow</a></li> <li><a href="/wiki/Andrej_Karpathy" title="Andrej Karpathy">Andrej Karpathy</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Architectures</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Neural_Turing_machine" title="Neural Turing machine">Neural Turing machine</a></li> <li><a href="/wiki/Differentiable_neural_computer" title="Differentiable neural computer">Differentiable neural computer</a></li> <li><a class="mw-selflink selflink">Transformer</a> <ul><li><a href="/wiki/Vision_transformer" title="Vision transformer">Vision transformer (ViT)</a></li></ul></li> <li><a href="/wiki/Recurrent_neural_network" title="Recurrent neural network">Recurrent neural network (RNN)</a></li> <li><a href="/wiki/Long_short-term_memory" title="Long short-term memory">Long short-term memory (LSTM)</a></li> <li><a href="/wiki/Gated_recurrent_unit" title="Gated recurrent unit">Gated recurrent unit (GRU)</a></li> <li><a href="/wiki/Echo_state_network" title="Echo state network">Echo state network</a></li> <li><a href="/wiki/Multilayer_perceptron" title="Multilayer perceptron">Multilayer perceptron (MLP)</a></li> <li><a href="/wiki/Convolutional_neural_network" title="Convolutional neural network">Convolutional neural network (CNN)</a></li> <li><a href="/wiki/Residual_neural_network" title="Residual neural network">Residual neural network (RNN)</a></li> <li><a href="/wiki/Highway_network" title="Highway network">Highway network</a></li> <li><a href="/wiki/Mamba_(deep_learning_architecture)" title="Mamba (deep learning architecture)">Mamba</a></li> <li><a href="/wiki/Autoencoder" title="Autoencoder">Autoencoder</a></li> <li><a href="/wiki/Variational_autoencoder" title="Variational autoencoder">Variational autoencoder (VAE)</a></li> <li><a href="/wiki/Generative_adversarial_network" title="Generative adversarial network">Generative adversarial network (GAN)</a></li> <li><a href="/wiki/Graph_neural_network" title="Graph neural network">Graph neural network (GNN)</a></li></ul> </div></td></tr><tr><td class="navbox-abovebelow" colspan="2"><div> <ul><li><span class="noviewer" typeof="mw:File"><a href="/wiki/File:Symbol_portal_class.svg" class="mw-file-description" title="Portal"><img alt="" src="//upload.wikimedia.org/wikipedia/en/thumb/e/e2/Symbol_portal_class.svg/16px-Symbol_portal_class.svg.png" decoding="async" width="16" height="16" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/en/thumb/e/e2/Symbol_portal_class.svg/23px-Symbol_portal_class.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/e/e2/Symbol_portal_class.svg/31px-Symbol_portal_class.svg.png 2x" 
data-file-width="180" data-file-height="185" /></a></span> Portals <ul><li><a href="/wiki/Portal:Technology" title="Portal:Technology">Technology</a></li></ul></li> <li><span class="noviewer" typeof="mw:File"><span title="Category"><img alt="" src="//upload.wikimedia.org/wikipedia/en/thumb/9/96/Symbol_category_class.svg/16px-Symbol_category_class.svg.png" decoding="async" width="16" height="16" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/96/Symbol_category_class.svg/23px-Symbol_category_class.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/96/Symbol_category_class.svg/31px-Symbol_category_class.svg.png 2x" data-file-width="180" data-file-height="185" /></span></span> <a href="/wiki/Category:Artificial_intelligence" title="Category:Artificial intelligence">Category</a> <ul><li><a href="/wiki/Category:Artificial_neural_networks" title="Category:Artificial neural networks">Artificial neural networks</a></li> <li><a href="/wiki/Category:Machine_learning" title="Category:Machine learning">Machine learning</a></li></ul></li> <li><span class="noviewer" typeof="mw:File"><span title="List-Class article"><img alt="" src="//upload.wikimedia.org/wikipedia/en/thumb/d/db/Symbol_list_class.svg/16px-Symbol_list_class.svg.png" decoding="async" width="16" height="16" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/en/thumb/d/db/Symbol_list_class.svg/23px-Symbol_list_class.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/d/db/Symbol_list_class.svg/31px-Symbol_list_class.svg.png 2x" data-file-width="180" data-file-height="185" /></span></span> List <ul><li><a href="/wiki/List_of_artificial_intelligence_companies" title="List of artificial intelligence companies">Companies</a></li> <li><a href="/wiki/List_of_artificial_intelligence_projects" title="List of artificial intelligence projects">Projects</a></li></ul></li></ul> </div></td></tr></tbody></table></div> <!-- NewPP limit report Parsed by mw‐api‐ext.codfw.main‐786d8bd985‐bvppj Cached time: 20250216123633 Cache expiry: 2592000 Reduced expiry: false Complications: [vary‐revision‐sha1, show‐toc] CPU time usage: 1.492 seconds Real time usage: 1.785 seconds Preprocessor visited node count: 11096/1000000 Post‐expand include size: 373217/2097152 bytes Template argument size: 5087/2097152 bytes Highest expansion depth: 16/100 Expensive parser function count: 8/500 Unstrip recursion depth: 1/20 Unstrip post‐expand size: 468247/5000000 bytes Lua time usage: 0.859/10.000 seconds Lua memory usage: 16306757/52428800 bytes Number of Wikibase entities loaded: 0/400 --> <!-- Transclusion expansion time report (%,ms,calls,template) 100.00% 1298.711 1 -total 60.00% 779.269 2 Template:Reflist 17.06% 221.508 24 Template:Cite_journal 14.44% 187.564 33 Template:Cite_arXiv 9.73% 126.418 7 Template:Annotated_link 8.49% 110.291 22 Template:Citation 7.03% 91.324 1 Template:Machine_learning 6.56% 85.165 22 Template:Cite_web 6.48% 84.096 1 Template:Sidebar_with_collapsible_lists 6.44% 83.662 1 Template:Short_description --> <!-- Saved in parser cache with key enwiki:pcache:61603971:|#|:idhash:canonical and timestamp 20250216123641 and revision id 1276021232. 
cdx-button--icon-only" id="ca-viewsource-sticky-header" tabindex="-1" data-event-name="ve-edit-protected-sticky-header"><span class="vector-icon mw-ui-icon-wikimedia-editLock mw-ui-icon-wikimedia-wikimedia-editLock"></span> <span></span> </a> </div> <div class="vector-sticky-header-buttons"> <button class="cdx-button cdx-button--weight-quiet mw-interlanguage-selector" id="p-lang-btn-sticky-header" tabindex="-1" data-event-name="ui.dropdown-p-lang-btn-sticky-header"><span class="vector-icon mw-ui-icon-wikimedia-language mw-ui-icon-wikimedia-wikimedia-language"></span> <span>28 languages</span> </button> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--action-progressive" id="ca-addsection-sticky-header" tabindex="-1" data-event-name="addsection-sticky-header"><span class="vector-icon mw-ui-icon-speechBubbleAdd-progressive mw-ui-icon-wikimedia-speechBubbleAdd-progressive"></span> <span>Add topic</span> </a> </div> <div class="vector-sticky-header-icon-end"> <div class="vector-user-links"> </div> </div> </div> </div> </div> <div class="vector-settings" id="p-dock-bottom"> <ul></ul> </div><script>(RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgHostname":"mw-web.codfw.main-b766959bd-7n9v9","wgBackendResponseTime":129,"wgPageParseReport":{"limitreport":{"cputime":"1.492","walltime":"1.785","ppvisitednodes":{"value":11096,"limit":1000000},"postexpandincludesize":{"value":373217,"limit":2097152},"templateargumentsize":{"value":5087,"limit":2097152},"expansiondepth":{"value":16,"limit":100},"expensivefunctioncount":{"value":8,"limit":500},"unstrip-depth":{"value":1,"limit":20},"unstrip-size":{"value":468247,"limit":5000000},"entityaccesscount":{"value":0,"limit":400},"timingprofile":["100.00% 1298.711 1 -total"," 60.00% 779.269 2 Template:Reflist"," 17.06% 221.508 24 Template:Cite_journal"," 14.44% 187.564 33 Template:Cite_arXiv"," 9.73% 126.418 7 Template:Annotated_link"," 8.49% 110.291 22 Template:Citation"," 7.03% 91.324 1 Template:Machine_learning"," 6.56% 85.165 22 Template:Cite_web"," 6.48% 84.096 1 Template:Sidebar_with_collapsible_lists"," 6.44% 83.662 1 Template:Short_description"]},"scribunto":{"limitreport-timeusage":{"value":"0.859","limit":"10.000"},"limitreport-memusage":{"value":16306757,"limit":52428800}},"cachereport":{"origin":"mw-api-ext.codfw.main-786d8bd985-bvppj","timestamp":"20250216123633","ttl":2592000,"transientcontent":false}}});});</script> <script type="application/ld+json">{"@context":"https:\/\/schema.org","@type":"Article","name":"Transformer (deep learning architecture)","url":"https:\/\/en.wikipedia.org\/wiki\/Transformer_(deep_learning_architecture)","sameAs":"http:\/\/www.wikidata.org\/entity\/Q85810444","mainEntity":"http:\/\/www.wikidata.org\/entity\/Q85810444","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\/\/www.wikimedia.org\/static\/images\/wmf-hor-googpub.png"}},"datePublished":"2019-08-25T16:32:02Z","dateModified":"2025-02-16T12:36:31Z","image":"https:\/\/upload.wikimedia.org\/wikipedia\/commons\/3\/34\/Transformer%2C_full_architecture.png","headline":"machine-learning model architecture first developed by Google Brain"}</script> </body> </html>