CINXE.COM
Neural machine translation - Wikipedia
<!DOCTYPE html> <html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available" lang="en" dir="ltr"> <head> <meta charset="UTF-8"> <title>Neural machine translation - Wikipedia</title> <script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available";var cookie=document.cookie.match(/(?:^|; )enwikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].split('%2C').forEach(function(pref){className=className.replace(new RegExp('(^| )'+pref.replace(/-clientpref-\w+$|[^\w-]+/g,'')+'-clientpref-\\w+( |$)'),'$1'+pref+'$2');});}document.documentElement.className=className;}());RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy", "wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"57178e31-1691-4938-8907-111b79fa5df6","wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Neural_machine_translation","wgTitle":"Neural 
machine translation","wgCurRevisionId":1257438152,"wgRevisionId":1257438152,"wgArticleId":47961606,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 errors: generic name","Articles with short description","Short description is different from Wikidata","All articles with unsourced statements","Articles with unsourced statements from December 2023","Applications of artificial intelligence","Computational linguistics","Machine translation","Tasks of natural language processing"],"wgPageViewLanguage":"en","wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName": "Neural_machine_translation","wgRelevantArticleId":47961606,"wgIsProbablyEditable":true,"wgRelevantPageIsProbablyEditable":true,"wgRestrictionEdit":[],"wgRestrictionMove":[],"wgNoticeProject":"wikipedia","wgCiteReferencePreviewsActive":false,"wgFlaggedRevsParams":{"tags":{"status":{"levels":1}}},"wgMediaViewerOnClick":true,"wgMediaViewerEnabledByDefault":true,"wgPopupsFlags":0,"wgVisualEditor":{"pageLanguageCode":"en","pageLanguageDir":"ltr","pageVariantFallbacks":"en"},"wgMFDisplayWikibaseDescriptions":{"search":true,"watchlist":true,"tagline":false,"nearby":true},"wgWMESchemaEditAttemptStepOversample":false,"wgWMEPageLength":40000,"wgRelatedArticlesCompat":[],"wgEditSubmitButtonLabelPublish":true,"wgULSPosition":"interlanguage","wgULSisCompactLinksEnabled":false,"wgVector2022LanguageInHeader":true,"wgULSisLanguageSelectorEmpty":false,"wgWikibaseItemId":"Q25053937","wgCheckUserClientHintsHeadersJsApi":["brands","architecture","bitness","fullVersionList","mobile","model","platform", 
"platformVersion"],"GEHomepageSuggestedEditsEnableTopics":true,"wgGETopicsMatchModeEnabled":false,"wgGEStructuredTaskRejectionReasonTextInputEnabled":false,"wgGELevelingUpEnabledForUser":false};RLSTATE={"ext.globalCssJs.user.styles":"ready","site.styles":"ready","user.styles":"ready","ext.globalCssJs.user":"ready","user":"ready","user.options":"loading","ext.cite.styles":"ready","ext.math.styles":"ready","skins.vector.search.codex.styles":"ready","skins.vector.styles":"ready","skins.vector.icons":"ready","jquery.makeCollapsible.styles":"ready","ext.wikimediamessages.styles":"ready","ext.visualEditor.desktopArticleTarget.noscript":"ready","ext.uls.interlanguage":"ready","wikibase.client.init":"ready","ext.wikimediaBadges":"ready"};RLPAGEMODULES=["ext.cite.ux-enhancements","site","mediawiki.page.ready","jquery.makeCollapsible","mediawiki.toc","skins.vector.js","ext.centralNotice.geoIP","ext.centralNotice.startUp","ext.gadget.ReferenceTooltips","ext.gadget.switcher", "ext.urlShortener.toolbar","ext.centralauth.centralautologin","mmv.bootstrap","ext.popups","ext.visualEditor.desktopArticleTarget.init","ext.visualEditor.targetLoader","ext.echo.centralauth","ext.eventLogging","ext.wikimediaEvents","ext.navigationTiming","ext.uls.interface","ext.cx.eventlogging.campaigns","ext.cx.uls.quick.actions","wikibase.client.vector-2022","ext.checkUser.clientHints","ext.growthExperiments.SuggestedEditSession","wikibase.sidebar.tracking"];</script> <script>(RLQ=window.RLQ||[]).push(function(){mw.loader.impl(function(){return["user.options@12s5i",function($,jQuery,require,module){mw.user.tokens.set({"patrolToken":"+\\","watchToken":"+\\","csrfToken":"+\\"}); }];});});</script> <link rel="stylesheet" 
href="/w/load.php?lang=en&modules=ext.cite.styles%7Cext.math.styles%7Cext.uls.interlanguage%7Cext.visualEditor.desktopArticleTarget.noscript%7Cext.wikimediaBadges%7Cext.wikimediamessages.styles%7Cjquery.makeCollapsible.styles%7Cskins.vector.icons%2Cstyles%7Cskins.vector.search.codex.styles%7Cwikibase.client.init&only=styles&skin=vector-2022"> <script async="" src="/w/load.php?lang=en&modules=startup&only=scripts&raw=1&skin=vector-2022"></script> <meta name="ResourceLoaderDynamicStyles" content=""> <link rel="stylesheet" href="/w/load.php?lang=en&modules=site.styles&only=styles&skin=vector-2022"> <meta name="generator" content="MediaWiki 1.44.0-wmf.5"> <meta name="referrer" content="origin"> <meta name="referrer" content="origin-when-cross-origin"> <meta name="robots" content="max-image-preview:standard"> <meta name="format-detection" content="telephone=no"> <meta name="viewport" content="width=1120"> <meta property="og:title" content="Neural machine translation - Wikipedia"> <meta property="og:type" content="website"> <link rel="preconnect" href="//upload.wikimedia.org"> <link rel="alternate" media="only screen and (max-width: 640px)" href="//en.m.wikipedia.org/wiki/Neural_machine_translation"> <link rel="alternate" type="application/x-wiki" title="Edit this page" href="/w/index.php?title=Neural_machine_translation&action=edit"> <link rel="apple-touch-icon" href="/static/apple-touch/wikipedia.png"> <link rel="icon" href="/static/favicon/wikipedia.ico"> <link rel="search" type="application/opensearchdescription+xml" href="/w/rest.php/v1/search" title="Wikipedia (en)"> <link rel="EditURI" type="application/rsd+xml" href="//en.wikipedia.org/w/api.php?action=rsd"> <link rel="canonical" href="https://en.wikipedia.org/wiki/Neural_machine_translation"> <link rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/deed.en"> <link rel="alternate" type="application/atom+xml" title="Wikipedia Atom feed" href="/w/index.php?title=Special:RecentChanges&feed=atom"> 
<link rel="dns-prefetch" href="//meta.wikimedia.org" /> <link rel="dns-prefetch" href="//login.wikimedia.org"> </head> <body class="skin--responsive skin-vector skin-vector-search-vue mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject mw-editable page-Neural_machine_translation rootpage-Neural_machine_translation skin-vector-2022 action-view"><a class="mw-jump-link" href="#bodyContent">Jump to content</a> <div class="vector-header-container"> <header class="vector-header mw-header"> <div class="vector-header-start"> <nav class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-dropdown" class="vector-dropdown vector-main-menu-dropdown vector-button-flush-left vector-button-flush-right" > <input type="checkbox" id="vector-main-menu-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-main-menu-dropdown" class="vector-dropdown-checkbox " aria-label="Main menu" > <label id="vector-main-menu-dropdown-label" for="vector-main-menu-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-menu mw-ui-icon-wikimedia-menu"></span> <span class="vector-dropdown-label-text">Main menu</span> </label> <div class="vector-dropdown-content"> <div id="vector-main-menu-unpinned-container" class="vector-unpinned-container"> <div id="vector-main-menu" class="vector-main-menu vector-pinnable-element"> <div class="vector-pinnable-header vector-main-menu-pinnable-header vector-pinnable-header-unpinned" data-feature-name="main-menu-pinned" data-pinnable-element-id="vector-main-menu" data-pinned-container-id="vector-main-menu-pinned-container" data-unpinned-container-id="vector-main-menu-unpinned-container" > <div class="vector-pinnable-header-label">Main menu</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" 
data-event-name="pinnable-header.vector-main-menu.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-main-menu.unpin">hide</button> </div> <div id="p-navigation" class="vector-menu mw-portlet mw-portlet-navigation" > <div class="vector-menu-heading"> Navigation </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="n-mainpage-description" class="mw-list-item"><a href="/wiki/Main_Page" title="Visit the main page [z]" accesskey="z"><span>Main page</span></a></li><li id="n-contents" class="mw-list-item"><a href="/wiki/Wikipedia:Contents" title="Guides to browsing Wikipedia"><span>Contents</span></a></li><li id="n-currentevents" class="mw-list-item"><a href="/wiki/Portal:Current_events" title="Articles related to current events"><span>Current events</span></a></li><li id="n-randompage" class="mw-list-item"><a href="/wiki/Special:Random" title="Visit a randomly selected article [x]" accesskey="x"><span>Random article</span></a></li><li id="n-aboutsite" class="mw-list-item"><a href="/wiki/Wikipedia:About" title="Learn about Wikipedia and how it works"><span>About Wikipedia</span></a></li><li id="n-contactpage" class="mw-list-item"><a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us" title="How to contact Wikipedia"><span>Contact us</span></a></li> </ul> </div> </div> <div id="p-interaction" class="vector-menu mw-portlet mw-portlet-interaction" > <div class="vector-menu-heading"> Contribute </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="n-help" class="mw-list-item"><a href="/wiki/Help:Contents" title="Guidance on how to use and edit Wikipedia"><span>Help</span></a></li><li id="n-introduction" class="mw-list-item"><a href="/wiki/Help:Introduction" title="Learn how to edit Wikipedia"><span>Learn to edit</span></a></li><li id="n-portal" class="mw-list-item"><a 
href="/wiki/Wikipedia:Community_portal" title="The hub for editors"><span>Community portal</span></a></li><li id="n-recentchanges" class="mw-list-item"><a href="/wiki/Special:RecentChanges" title="A list of recent changes to Wikipedia [r]" accesskey="r"><span>Recent changes</span></a></li><li id="n-upload" class="mw-list-item"><a href="/wiki/Wikipedia:File_upload_wizard" title="Add images or other media for use on Wikipedia"><span>Upload file</span></a></li> </ul> </div> </div> </div> </div> </div> </div> </nav> <a href="/wiki/Main_Page" class="mw-logo"> <img class="mw-logo-icon" src="/static/images/icons/wikipedia.png" alt="" aria-hidden="true" height="50" width="50"> <span class="mw-logo-container skin-invert"> <img class="mw-logo-wordmark" alt="Wikipedia" src="/static/images/mobile/copyright/wikipedia-wordmark-en.svg" style="width: 7.5em; height: 1.125em;"> <img class="mw-logo-tagline" alt="The Free Encyclopedia" src="/static/images/mobile/copyright/wikipedia-tagline-en.svg" width="117" height="13" style="width: 7.3125em; height: 0.8125em;"> </span> </a> </div> <div class="vector-header-end"> <div id="p-search" role="search" class="vector-search-box-vue vector-search-box-collapses vector-search-box-show-thumbnail vector-search-box-auto-expand-width vector-search-box"> <a href="/wiki/Special:Search" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only search-toggle" title="Search Wikipedia [f]" accesskey="f"><span class="vector-icon mw-ui-icon-search mw-ui-icon-wikimedia-search"></span> <span>Search</span> </a> <div class="vector-typeahead-search-container"> <div class="cdx-typeahead-search cdx-typeahead-search--show-thumbnail cdx-typeahead-search--auto-expand-width"> <form action="/w/index.php" id="searchform" class="cdx-search-input cdx-search-input--has-end-button"> <div id="simpleSearch" class="cdx-search-input__input-wrapper" data-search-loc="header-moved"> <div class="cdx-text-input 
cdx-text-input--has-start-icon"> <input class="cdx-text-input__input" type="search" name="search" placeholder="Search Wikipedia" aria-label="Search Wikipedia" autocapitalize="sentences" title="Search Wikipedia [f]" accesskey="f" id="searchInput" > <span class="cdx-text-input__icon cdx-text-input__start-icon"></span> </div> <input type="hidden" name="title" value="Special:Search"> </div> <button class="cdx-button cdx-search-input__end-button">Search</button> </form> </div> </div> </div> <nav class="vector-user-links vector-user-links-wide" aria-label="Personal tools"> <div class="vector-user-links-main"> <div id="p-vector-user-menu-preferences" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <div id="p-vector-user-menu-userpage" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <nav class="vector-appearance-landmark" aria-label="Appearance"> <div id="vector-appearance-dropdown" class="vector-dropdown " title="Change the appearance of the page's font size, width, and color" > <input type="checkbox" id="vector-appearance-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-appearance-dropdown" class="vector-dropdown-checkbox " aria-label="Appearance" > <label id="vector-appearance-dropdown-label" for="vector-appearance-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-appearance mw-ui-icon-wikimedia-appearance"></span> <span class="vector-dropdown-label-text">Appearance</span> </label> <div class="vector-dropdown-content"> <div id="vector-appearance-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <div id="p-vector-user-menu-notifications" 
class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <div id="p-vector-user-menu-overflow" class="vector-menu mw-portlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-sitesupport-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en" class=""><span>Donate</span></a> </li> <li id="pt-createaccount-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="/w/index.php?title=Special:CreateAccount&returnto=Neural+machine+translation" title="You are encouraged to create an account and log in; however, it is not mandatory" class=""><span>Create account</span></a> </li> <li id="pt-login-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="/w/index.php?title=Special:UserLogin&returnto=Neural+machine+translation" title="You're encouraged to log in; however, it's not mandatory. 
[o]" accesskey="o" class=""><span>Log in</span></a> </li> </ul> </div> </div> </div> <div id="vector-user-links-dropdown" class="vector-dropdown vector-user-menu vector-button-flush-right vector-user-menu-logged-out" title="Log in and more options" > <input type="checkbox" id="vector-user-links-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-user-links-dropdown" class="vector-dropdown-checkbox " aria-label="Personal tools" > <label id="vector-user-links-dropdown-label" for="vector-user-links-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-ellipsis mw-ui-icon-wikimedia-ellipsis"></span> <span class="vector-dropdown-label-text">Personal tools</span> </label> <div class="vector-dropdown-content"> <div id="p-personal" class="vector-menu mw-portlet mw-portlet-personal user-links-collapsible-item" title="User menu" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-sitesupport" class="user-links-collapsible-item mw-list-item"><a href="https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en"><span>Donate</span></a></li><li id="pt-createaccount" class="user-links-collapsible-item mw-list-item"><a href="/w/index.php?title=Special:CreateAccount&returnto=Neural+machine+translation" title="You are encouraged to create an account and log in; however, it is not mandatory"><span class="vector-icon mw-ui-icon-userAdd mw-ui-icon-wikimedia-userAdd"></span> <span>Create account</span></a></li><li id="pt-login" class="user-links-collapsible-item mw-list-item"><a href="/w/index.php?title=Special:UserLogin&returnto=Neural+machine+translation" title="You're encouraged to log in; however, it's not mandatory. 
[o]" accesskey="o"><span class="vector-icon mw-ui-icon-logIn mw-ui-icon-wikimedia-logIn"></span> <span>Log in</span></a></li> </ul> </div> </div> <div id="p-user-menu-anon-editor" class="vector-menu mw-portlet mw-portlet-user-menu-anon-editor" > <div class="vector-menu-heading"> Pages for logged out editors <a href="/wiki/Help:Introduction" aria-label="Learn more about editing"><span>learn more</span></a> </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-anoncontribs" class="mw-list-item"><a href="/wiki/Special:MyContributions" title="A list of edits made from this IP address [y]" accesskey="y"><span>Contributions</span></a></li><li id="pt-anontalk" class="mw-list-item"><a href="/wiki/Special:MyTalk" title="Discussion about edits from this IP address [n]" accesskey="n"><span>Talk</span></a></li> </ul> </div> </div> </div> </div> </nav> </div> </header> </div> <div class="mw-page-container"> <div class="mw-page-container-inner"> <div class="vector-sitenotice-container"> <div id="siteNotice"><!-- CentralNotice --></div> </div> <div class="vector-column-start"> <div class="vector-main-menu-container"> <div id="mw-navigation"> <nav id="mw-panel" class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-pinned-container" class="vector-pinned-container"> </div> </nav> </div> </div> <div class="vector-sticky-pinned-container"> <nav id="mw-panel-toc" aria-label="Contents" data-event-name="ui.sidebar-toc" class="mw-table-of-contents-container vector-toc-landmark"> <div id="vector-toc-pinned-container" class="vector-pinned-container"> <div id="vector-toc" class="vector-toc vector-pinnable-element"> <div class="vector-pinnable-header vector-toc-pinnable-header vector-pinnable-header-pinned" data-feature-name="toc-pinned" data-pinnable-element-id="vector-toc" > <h2 class="vector-pinnable-header-label">Contents</h2> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" 
data-event-name="pinnable-header.vector-toc.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-toc.unpin">hide</button> </div> <ul class="vector-toc-contents" id="mw-panel-toc-list"> <li id="toc-mw-content-text" class="vector-toc-list-item vector-toc-level-1"> <a href="#" class="vector-toc-link"> <div class="vector-toc-text">(Top)</div> </a> </li> <li id="toc-Overview" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Overview"> <div class="vector-toc-text"> <span class="vector-toc-numb">1</span> <span>Overview</span> </div> </a> <ul id="toc-Overview-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-History" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#History"> <div class="vector-toc-text"> <span class="vector-toc-numb">2</span> <span>History</span> </div> </a> <button aria-controls="toc-History-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle History subsection</span> </button> <ul id="toc-History-sublist" class="vector-toc-list"> <li id="toc-Early_approaches" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Early_approaches"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.1</span> <span>Early approaches</span> </div> </a> <ul id="toc-Early_approaches-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Hybrid_approaches" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Hybrid_approaches"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.2</span> <span>Hybrid approaches</span> </div> </a> <ul id="toc-Hybrid_approaches-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-seq2seq" 
class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#seq2seq"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.3</span> <span>seq2seq</span> </div> </a> <ul id="toc-seq2seq-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Transformer" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Transformer"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.4</span> <span>Transformer</span> </div> </a> <ul id="toc-Transformer-sublist" class="vector-toc-list"> <li id="toc-Generative_LLMs" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Generative_LLMs"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.4.1</span> <span>Generative LLMs</span> </div> </a> <ul id="toc-Generative_LLMs-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> </ul> </li> <li id="toc-Comparison_with_statistical_machine_translation" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Comparison_with_statistical_machine_translation"> <div class="vector-toc-text"> <span class="vector-toc-numb">3</span> <span>Comparison with statistical machine translation</span> </div> </a> <ul id="toc-Comparison_with_statistical_machine_translation-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Training_procedure" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Training_procedure"> <div class="vector-toc-text"> <span class="vector-toc-numb">4</span> <span>Training procedure</span> </div> </a> <button aria-controls="toc-Training_procedure-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Training procedure subsection</span> </button> <ul id="toc-Training_procedure-sublist" class="vector-toc-list"> <li 
id="toc-Cross-entropy_loss" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Cross-entropy_loss"> <div class="vector-toc-text"> <span class="vector-toc-numb">4.1</span> <span>Cross-entropy loss</span> </div> </a> <ul id="toc-Cross-entropy_loss-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Teacher_forcing" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Teacher_forcing"> <div class="vector-toc-text"> <span class="vector-toc-numb">4.2</span> <span>Teacher forcing</span> </div> </a> <ul id="toc-Teacher_forcing-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Translation_by_prompt_engineering_LLMs" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Translation_by_prompt_engineering_LLMs"> <div class="vector-toc-text"> <span class="vector-toc-numb">5</span> <span>Translation by prompt engineering LLMs</span> </div> </a> <ul id="toc-Translation_by_prompt_engineering_LLMs-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Literature" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Literature"> <div class="vector-toc-text"> <span class="vector-toc-numb">6</span> <span>Literature</span> </div> </a> <ul id="toc-Literature-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-See_also" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#See_also"> <div class="vector-toc-text"> <span class="vector-toc-numb">7</span> <span>See also</span> </div> </a> <ul id="toc-See_also-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-References" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#References"> <div class="vector-toc-text"> <span class="vector-toc-numb">8</span> <span>References</span> </div> </a> <ul 
id="toc-References-sublist" class="vector-toc-list"> </ul> </li> </ul> </div> </div> </nav> </div> </div> <div class="mw-content-container"> <main id="content" class="mw-body"> <header class="mw-body-header vector-page-titlebar"> <nav aria-label="Contents" class="vector-toc-landmark"> <div id="vector-page-titlebar-toc" class="vector-dropdown vector-page-titlebar-toc vector-button-flush-left" > <input type="checkbox" id="vector-page-titlebar-toc-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-titlebar-toc" class="vector-dropdown-checkbox " aria-label="Toggle the table of contents" > <label id="vector-page-titlebar-toc-label" for="vector-page-titlebar-toc-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-listBullet mw-ui-icon-wikimedia-listBullet"></span> <span class="vector-dropdown-label-text">Toggle the table of contents</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-titlebar-toc-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <h1 id="firstHeading" class="firstHeading mw-first-heading"><span class="mw-page-title-main">Neural machine translation</span></h1> <div id="p-lang-btn" class="vector-dropdown mw-portlet mw-portlet-lang" > <input type="checkbox" id="p-lang-btn-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-p-lang-btn" class="vector-dropdown-checkbox mw-interlanguage-selector" aria-label="Go to an article in another language. 
Available in 21 languages" > <label id="p-lang-btn-label" for="p-lang-btn-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--action-progressive mw-portlet-lang-heading-21" aria-hidden="true" ><span class="vector-icon mw-ui-icon-language-progressive mw-ui-icon-wikimedia-language-progressive"></span> <span class="vector-dropdown-label-text">21 languages</span> </label> <div class="vector-dropdown-content"> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li class="interlanguage-link interwiki-ar mw-list-item"><a href="https://ar.wikipedia.org/wiki/%D8%AA%D8%B1%D8%AC%D9%85%D8%A9_%D8%A2%D9%84%D9%8A%D8%A9_%D8%B9%D8%B5%D8%A8%D9%8A%D8%A9" title="ترجمة آلية عصبية – Arabic" lang="ar" hreflang="ar" data-title="ترجمة آلية عصبية" data-language-autonym="العربية" data-language-local-name="Arabic" class="interlanguage-link-target"><span>العربية</span></a></li><li class="interlanguage-link interwiki-zh-min-nan mw-list-item"><a href="https://zh-min-nan.wikipedia.org/wiki/S%C3%AEn-keng_ki-h%C3%A2i_hoan-e%CC%8Dk" title="Sîn-keng ki-hâi hoan-e̍k – Minnan" lang="nan" hreflang="nan" data-title="Sîn-keng ki-hâi hoan-e̍k" data-language-autonym="閩南語 / Bân-lâm-gú" data-language-local-name="Minnan" class="interlanguage-link-target"><span>閩南語 / Bân-lâm-gú</span></a></li><li class="interlanguage-link interwiki-ca mw-list-item"><a href="https://ca.wikipedia.org/wiki/Traducci%C3%B3_autom%C3%A0tica_neuronal" title="Traducció automàtica neuronal – Catalan" lang="ca" hreflang="ca" data-title="Traducció automàtica neuronal" data-language-autonym="Català" data-language-local-name="Catalan" class="interlanguage-link-target"><span>Català</span></a></li><li class="interlanguage-link interwiki-es mw-list-item"><a href="https://es.wikipedia.org/wiki/Traducci%C3%B3n_autom%C3%A1tica_neuronal" title="Traducción automática neuronal – Spanish" lang="es" hreflang="es" data-title="Traducción 
automática neuronal" data-language-autonym="Español" data-language-local-name="Spanish" class="interlanguage-link-target"><span>Español</span></a></li><li class="interlanguage-link interwiki-eu mw-list-item"><a href="https://eu.wikipedia.org/wiki/Itzulpen_automatiko_neuronal" title="Itzulpen automatiko neuronal – Basque" lang="eu" hreflang="eu" data-title="Itzulpen automatiko neuronal" data-language-autonym="Euskara" data-language-local-name="Basque" class="interlanguage-link-target"><span>Euskara</span></a></li><li class="interlanguage-link interwiki-fr mw-list-item"><a href="https://fr.wikipedia.org/wiki/Traduction_automatique_neuronale" title="Traduction automatique neuronale – French" lang="fr" hreflang="fr" data-title="Traduction automatique neuronale" data-language-autonym="Français" data-language-local-name="French" class="interlanguage-link-target"><span>Français</span></a></li><li class="interlanguage-link interwiki-ga mw-list-item"><a href="https://ga.wikipedia.org/wiki/Meais%C3%ADnaistri%C3%BAch%C3%A1n_n%C3%A9arach" title="Meaisínaistriúchán néarach – Irish" lang="ga" hreflang="ga" data-title="Meaisínaistriúchán néarach" data-language-autonym="Gaeilge" data-language-local-name="Irish" class="interlanguage-link-target"><span>Gaeilge</span></a></li><li class="interlanguage-link interwiki-ko mw-list-item"><a href="https://ko.wikipedia.org/wiki/%EC%8B%A0%EA%B2%BD%EB%A7%9D_%EA%B8%B0%EA%B3%84_%EB%B2%88%EC%97%AD" title="신경망 기계 번역 – Korean" lang="ko" hreflang="ko" data-title="신경망 기계 번역" data-language-autonym="한국어" data-language-local-name="Korean" class="interlanguage-link-target"><span>한국어</span></a></li><li class="interlanguage-link interwiki-id mw-list-item"><a href="https://id.wikipedia.org/wiki/Terjemahan_mesin_saraf" title="Terjemahan mesin saraf – Indonesian" lang="id" hreflang="id" data-title="Terjemahan mesin saraf" data-language-autonym="Bahasa Indonesia" data-language-local-name="Indonesian" class="interlanguage-link-target"><span>Bahasa 
Indonesia</span></a></li><li class="interlanguage-link interwiki-ia mw-list-item"><a href="https://ia.wikipedia.org/wiki/Traduction_automatic_neuronal" title="Traduction automatic neuronal – Interlingua" lang="ia" hreflang="ia" data-title="Traduction automatic neuronal" data-language-autonym="Interlingua" data-language-local-name="Interlingua" class="interlanguage-link-target"><span>Interlingua</span></a></li><li class="interlanguage-link interwiki-xmf mw-list-item"><a href="https://xmf.wikipedia.org/wiki/%E1%83%9C%E1%83%94%E1%83%98%E1%83%A0%E1%83%9D%E1%83%9C%E1%83%A3%E1%83%9A%E1%83%98_%E1%83%9B%E1%83%90%E1%83%9C%E1%83%A5%E1%83%90%E1%83%9C%E1%83%A3%E1%83%A0%E1%83%98_%E1%83%97%E1%83%90%E1%83%9C%E1%83%92%E1%83%A3%E1%83%90" title="ნეირონული მანქანური თანგუა – Mingrelian" lang="xmf" hreflang="xmf" data-title="ნეირონული მანქანური თანგუა" data-language-autonym="მარგალური" data-language-local-name="Mingrelian" class="interlanguage-link-target"><span>მარგალური</span></a></li><li class="interlanguage-link interwiki-ja mw-list-item"><a href="https://ja.wikipedia.org/wiki/%E3%83%8B%E3%83%A5%E3%83%BC%E3%83%A9%E3%83%AB%E6%A9%9F%E6%A2%B0%E7%BF%BB%E8%A8%B3" title="ニューラル機械翻訳 – Japanese" lang="ja" hreflang="ja" data-title="ニューラル機械翻訳" data-language-autonym="日本語" data-language-local-name="Japanese" class="interlanguage-link-target"><span>日本語</span></a></li><li class="interlanguage-link interwiki-or mw-list-item"><a href="https://or.wikipedia.org/wiki/%E0%AC%A8%E0%AC%BF%E0%AC%89%E0%AC%B0%E0%AC%BE%E0%AC%B2_%E0%AC%AE%E0%AD%87%E0%AC%B8%E0%AC%BF%E0%AC%A8_%E0%AC%9F%E0%AD%8D%E0%AC%B0%E0%AC%BE%E0%AC%A8%E0%AD%8D%E0%AC%B8%E0%AC%B2%E0%AD%87%E0%AC%B8%E0%AC%A8" title="ନିଉରାଲ ମେସିନ ଟ୍ରାନ୍ସଲେସନ – Odia" lang="or" hreflang="or" data-title="ନିଉରାଲ ମେସିନ ଟ୍ରାନ୍ସଲେସନ" data-language-autonym="ଓଡ଼ିଆ" data-language-local-name="Odia" class="interlanguage-link-target"><span>ଓଡ଼ିଆ</span></a></li><li class="interlanguage-link interwiki-uz mw-list-item"><a 
href="https://uz.wikipedia.org/wiki/Neyron_mashina_tarjimasi" title="Neyron mashina tarjimasi – Uzbek" lang="uz" hreflang="uz" data-title="Neyron mashina tarjimasi" data-language-autonym="Oʻzbekcha / ўзбекча" data-language-local-name="Uzbek" class="interlanguage-link-target"><span>Oʻzbekcha / ўзбекча</span></a></li><li class="interlanguage-link interwiki-ru mw-list-item"><a href="https://ru.wikipedia.org/wiki/%D0%9D%D0%B5%D0%B9%D1%80%D0%BE%D0%BD%D0%BD%D1%8B%D0%B9_%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D0%BD%D1%8B%D0%B9_%D0%BF%D0%B5%D1%80%D0%B5%D0%B2%D0%BE%D0%B4" title="Нейронный машинный перевод – Russian" lang="ru" hreflang="ru" data-title="Нейронный машинный перевод" data-language-autonym="Русский" data-language-local-name="Russian" class="interlanguage-link-target"><span>Русский</span></a></li><li class="interlanguage-link interwiki-th mw-list-item"><a href="https://th.wikipedia.org/wiki/%E0%B8%81%E0%B8%B2%E0%B8%A3%E0%B9%81%E0%B8%9B%E0%B8%A5%E0%B8%94%E0%B9%89%E0%B8%A7%E0%B8%A2%E0%B9%80%E0%B8%84%E0%B8%A3%E0%B8%B7%E0%B9%88%E0%B8%AD%E0%B8%87%E0%B8%9B%E0%B8%A3%E0%B8%B0%E0%B8%AA%E0%B8%B2%E0%B8%97" title="การแปลด้วยเครื่องประสาท – Thai" lang="th" hreflang="th" data-title="การแปลด้วยเครื่องประสาท" data-language-autonym="ไทย" data-language-local-name="Thai" class="interlanguage-link-target"><span>ไทย</span></a></li><li class="interlanguage-link interwiki-tr mw-list-item"><a href="https://tr.wikipedia.org/wiki/N%C3%B6ral_makine_%C3%A7evirisi" title="Nöral makine çevirisi – Turkish" lang="tr" hreflang="tr" data-title="Nöral makine çevirisi" data-language-autonym="Türkçe" data-language-local-name="Turkish" class="interlanguage-link-target"><span>Türkçe</span></a></li><li class="interlanguage-link interwiki-uk mw-list-item"><a href="https://uk.wikipedia.org/wiki/%D0%9D%D0%B5%D0%B9%D1%80%D0%BE%D0%BD%D0%BD%D0%B8%D0%B9_%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D0%BD%D0%B8%D0%B9_%D0%BF%D0%B5%D1%80%D0%B5%D0%BA%D0%BB%D0%B0%D0%B4" title="Нейронний машинний переклад – Ukrainian" lang="uk" 
hreflang="uk" data-title="Нейронний машинний переклад" data-language-autonym="Українська" data-language-local-name="Ukrainian" class="interlanguage-link-target"><span>Українська</span></a></li><li class="interlanguage-link interwiki-vi mw-list-item"><a href="https://vi.wikipedia.org/wiki/D%E1%BB%8Bch_m%C3%A1y_b%E1%BA%B1ng_n%C6%A1-ron" title="Dịch máy bằng nơ-ron – Vietnamese" lang="vi" hreflang="vi" data-title="Dịch máy bằng nơ-ron" data-language-autonym="Tiếng Việt" data-language-local-name="Vietnamese" class="interlanguage-link-target"><span>Tiếng Việt</span></a></li><li class="interlanguage-link interwiki-zh-yue mw-list-item"><a href="https://zh-yue.wikipedia.org/wiki/%E7%A5%9E%E7%B6%93%E6%A9%9F%E6%A2%B0%E7%BF%BB%E8%AD%AF" title="神經機械翻譯 – Cantonese" lang="yue" hreflang="yue" data-title="神經機械翻譯" data-language-autonym="粵語" data-language-local-name="Cantonese" class="interlanguage-link-target"><span>粵語</span></a></li><li class="interlanguage-link interwiki-zh mw-list-item"><a href="https://zh.wikipedia.org/wiki/%E7%A5%9E%E7%BB%8F%E6%9C%BA%E5%99%A8%E7%BF%BB%E8%AF%91" title="神经机器翻译 – Chinese" lang="zh" hreflang="zh" data-title="神经机器翻译" data-language-autonym="中文" data-language-local-name="Chinese" class="interlanguage-link-target"><span>中文</span></a></li> </ul> <div class="after-portlet after-portlet-lang"><span class="wb-langlinks-edit wb-langlinks-link"><a href="https://www.wikidata.org/wiki/Special:EntityPage/Q25053937#sitelinks-wikipedia" title="Edit interlanguage links" class="wbc-editpage">Edit links</a></span></div> </div> </div> </div> </header> <div class="vector-page-toolbar"> <div class="vector-page-toolbar-container"> <div id="left-navigation"> <nav aria-label="Namespaces"> <div id="p-associated-pages" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-associated-pages" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-nstab-main" class="selected vector-tab-noicon mw-list-item"><a 
href="/wiki/Neural_machine_translation" title="View the content page [c]" accesskey="c"><span>Article</span></a></li><li id="ca-talk" class="vector-tab-noicon mw-list-item"><a href="/wiki/Talk:Neural_machine_translation" rel="discussion" title="Discuss improvements to the content page [t]" accesskey="t"><span>Talk</span></a></li> </ul> </div> </div> <div id="vector-variants-dropdown" class="vector-dropdown emptyPortlet" > <input type="checkbox" id="vector-variants-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-variants-dropdown" class="vector-dropdown-checkbox " aria-label="Change language variant" > <label id="vector-variants-dropdown-label" for="vector-variants-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" aria-hidden="true" ><span class="vector-dropdown-label-text">English</span> </label> <div class="vector-dropdown-content"> <div id="p-variants" class="vector-menu mw-portlet mw-portlet-variants emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> </div> </div> </nav> </div> <div id="right-navigation" class="vector-collapsible"> <nav aria-label="Views"> <div id="p-views" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-views" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-view" class="selected vector-tab-noicon mw-list-item"><a href="/wiki/Neural_machine_translation"><span>Read</span></a></li><li id="ca-edit" class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Neural_machine_translation&action=edit" title="Edit this page [e]" accesskey="e"><span>Edit</span></a></li><li id="ca-history" class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Neural_machine_translation&action=history" title="Past revisions of this page [h]" accesskey="h"><span>View history</span></a></li> </ul> </div> </div> </nav> <nav 
class="vector-page-tools-landmark" aria-label="Page tools"> <div id="vector-page-tools-dropdown" class="vector-dropdown vector-page-tools-dropdown" > <input type="checkbox" id="vector-page-tools-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-tools-dropdown" class="vector-dropdown-checkbox " aria-label="Tools" > <label id="vector-page-tools-dropdown-label" for="vector-page-tools-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" aria-hidden="true" ><span class="vector-dropdown-label-text">Tools</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-tools-unpinned-container" class="vector-unpinned-container"> <div id="vector-page-tools" class="vector-page-tools vector-pinnable-element"> <div class="vector-pinnable-header vector-page-tools-pinnable-header vector-pinnable-header-unpinned" data-feature-name="page-tools-pinned" data-pinnable-element-id="vector-page-tools" data-pinned-container-id="vector-page-tools-pinned-container" data-unpinned-container-id="vector-page-tools-unpinned-container" > <div class="vector-pinnable-header-label">Tools</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-page-tools.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-page-tools.unpin">hide</button> </div> <div id="p-cactions" class="vector-menu mw-portlet mw-portlet-cactions emptyPortlet vector-has-collapsible-items" title="More options" > <div class="vector-menu-heading"> Actions </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-more-view" class="selected vector-more-collapsible-item mw-list-item"><a href="/wiki/Neural_machine_translation"><span>Read</span></a></li><li id="ca-more-edit" 
class="vector-more-collapsible-item mw-list-item"><a href="/w/index.php?title=Neural_machine_translation&action=edit" title="Edit this page [e]" accesskey="e"><span>Edit</span></a></li><li id="ca-more-history" class="vector-more-collapsible-item mw-list-item"><a href="/w/index.php?title=Neural_machine_translation&action=history"><span>View history</span></a></li> </ul> </div> </div> <div id="p-tb" class="vector-menu mw-portlet mw-portlet-tb" > <div class="vector-menu-heading"> General </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="t-whatlinkshere" class="mw-list-item"><a href="/wiki/Special:WhatLinksHere/Neural_machine_translation" title="List of all English Wikipedia pages containing links to this page [j]" accesskey="j"><span>What links here</span></a></li><li id="t-recentchangeslinked" class="mw-list-item"><a href="/wiki/Special:RecentChangesLinked/Neural_machine_translation" rel="nofollow" title="Recent changes in pages linked from this page [k]" accesskey="k"><span>Related changes</span></a></li><li id="t-upload" class="mw-list-item"><a href="/wiki/Wikipedia:File_Upload_Wizard" title="Upload files [u]" accesskey="u"><span>Upload file</span></a></li><li id="t-specialpages" class="mw-list-item"><a href="/wiki/Special:SpecialPages" title="A list of all special pages [q]" accesskey="q"><span>Special pages</span></a></li><li id="t-permalink" class="mw-list-item"><a href="/w/index.php?title=Neural_machine_translation&oldid=1257438152" title="Permanent link to this revision of this page"><span>Permanent link</span></a></li><li id="t-info" class="mw-list-item"><a href="/w/index.php?title=Neural_machine_translation&action=info" title="More information about this page"><span>Page information</span></a></li><li id="t-cite" class="mw-list-item"><a href="/w/index.php?title=Special:CiteThisPage&page=Neural_machine_translation&id=1257438152&wpFormIdentifier=titleform" title="Information on how to cite this page"><span>Cite this 
page</span></a></li><li id="t-urlshortener" class="mw-list-item"><a href="/w/index.php?title=Special:UrlShortener&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FNeural_machine_translation"><span>Get shortened URL</span></a></li><li id="t-urlshortener-qrcode" class="mw-list-item"><a href="/w/index.php?title=Special:QrCode&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FNeural_machine_translation"><span>Download QR code</span></a></li> </ul> </div> </div> <div id="p-coll-print_export" class="vector-menu mw-portlet mw-portlet-coll-print_export" > <div class="vector-menu-heading"> Print/export </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="coll-download-as-rl" class="mw-list-item"><a href="/w/index.php?title=Special:DownloadAsPdf&page=Neural_machine_translation&action=show-download-screen" title="Download this page as a PDF file"><span>Download as PDF</span></a></li><li id="t-print" class="mw-list-item"><a href="/w/index.php?title=Neural_machine_translation&printable=yes" title="Printable version of this page [p]" accesskey="p"><span>Printable version</span></a></li> </ul> </div> </div> <div id="p-wikibase-otherprojects" class="vector-menu mw-portlet mw-portlet-wikibase-otherprojects" > <div class="vector-menu-heading"> In other projects </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="t-wikibase" class="wb-otherproject-link wb-otherproject-wikibase-dataitem mw-list-item"><a href="https://www.wikidata.org/wiki/Special:EntityPage/Q25053937" title="Structured data on this page hosted by Wikidata [g]" accesskey="g"><span>Wikidata item</span></a></li> </ul> </div> </div> </div> </div> </div> </div> </nav> </div> </div> </div> <div class="vector-column-end"> <div class="vector-sticky-pinned-container"> <nav class="vector-page-tools-landmark" aria-label="Page tools"> <div id="vector-page-tools-pinned-container" class="vector-pinned-container"> </div> </nav> <nav class="vector-appearance-landmark" 
aria-label="Appearance"> <div id="vector-appearance-pinned-container" class="vector-pinned-container"> <div id="vector-appearance" class="vector-appearance vector-pinnable-element"> <div class="vector-pinnable-header vector-appearance-pinnable-header vector-pinnable-header-pinned" data-feature-name="appearance-pinned" data-pinnable-element-id="vector-appearance" data-pinned-container-id="vector-appearance-pinned-container" data-unpinned-container-id="vector-appearance-unpinned-container" > <div class="vector-pinnable-header-label">Appearance</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-appearance.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-appearance.unpin">hide</button> </div> </div> </div> </nav> </div> </div> <div id="bodyContent" class="vector-body" aria-labelledby="firstHeading" data-mw-ve-target-container> <div class="vector-body-before-content"> <div class="mw-indicators"> </div> <div id="siteSub" class="noprint">From Wikipedia, the free encyclopedia</div> </div> <div id="contentSub"><div id="mw-content-subtitle"></div></div> <div id="mw-content-text" class="mw-body-content"><div class="mw-content-ltr mw-parser-output" lang="en" dir="ltr"><div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">Approach to machine translation using artificial neural networks</div> <p><b>Neural machine translation</b> (<b>NMT</b>) is an approach to <a href="/wiki/Machine_translation" title="Machine translation">machine translation</a> that uses an <a href="/wiki/Artificial_neural_network" class="mw-redirect" title="Artificial neural network">artificial neural network</a> to predict the likelihood of a sequence of words, typically modeling entire sentences in a single integrated model. 
</p><p>It is the dominant approach today<sup id="cite_ref-Koehn2020_1-0" class="reference"><a href="#cite_note-Koehn2020-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 293">: 293 </span></sup><sup id="cite_ref-Stahlberg2020_2-0" class="reference"><a href="#cite_note-Stahlberg2020-2"><span class="cite-bracket">[</span>2<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 1">: 1 </span></sup> and can produce translations that rival human translations when translating between high-resource languages under specific conditions.<sup id="cite_ref-Popel2020_3-0" class="reference"><a href="#cite_note-Popel2020-3"><span class="cite-bracket">[</span>3<span class="cite-bracket">]</span></a></sup> However, there still remain challenges, especially with languages where less high-quality data is available,<sup id="cite_ref-Haddow2022_4-0" class="reference"><a href="#cite_note-Haddow2022-4"><span class="cite-bracket">[</span>4<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-Poibeau2022_5-0" class="reference"><a href="#cite_note-Poibeau2022-5"><span class="cite-bracket">[</span>5<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-Koehn2020_1-1" class="reference"><a href="#cite_note-Koehn2020-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 293">: 293 </span></sup> and with <a href="/wiki/Domain_adaptation#Domain_shift" title="Domain adaptation">domain shift</a> between the data a system was trained on and the texts it is supposed to translate.<sup id="cite_ref-Koehn2020_1-2" class="reference"><a href="#cite_note-Koehn2020-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 293">: 293 </span></sup> NMT systems also tend to produce fairly literal translations.<sup 
id="cite_ref-Poibeau2022_5-1" class="reference"><a href="#cite_note-Poibeau2022-5"><span class="cite-bracket">[</span>5<span class="cite-bracket">]</span></a></sup> </p> <meta property="mw:PageProp/toc" /> <div class="mw-heading mw-heading2"><h2 id="Overview">Overview</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Neural_machine_translation&action=edit&section=1" title="Edit section: Overview"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>In the translation task, a sentence <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \mathbf {x} =x_{1,I}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="bold">x</mi> </mrow> <mo>=</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> <mo>,</mo> <mi>I</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \mathbf {x} =x_{1,I}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/bc8ac345f77ad6533d93b83d8eba761cb8851ae8" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.005ex; width:8.179ex; height:2.343ex;" alt="{\displaystyle \mathbf {x} =x_{1,I}}"></span> (consisting of <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle I}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>I</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle I}</annotation> </semantics> </math></span><img 
src="https://wikimedia.org/api/rest_v1/media/math/render/svg/535ea7fc4134a31cbe2251d9d3511374bc41be9f" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.172ex; height:2.176ex;" alt="{\displaystyle I}"></span> tokens <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle x_{i}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle x_{i}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/e87000dd6142b81d041896a30fe58f0c3acb2158" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:2.129ex; height:2.009ex;" alt="{\displaystyle x_{i}}"></span>) in the source language is to be translated into a sentence <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \mathbf {y} =x_{1,J}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="bold">y</mi> </mrow> <mo>=</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> <mo>,</mo> <mi>J</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \mathbf {y} =x_{1,J}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/4eb59780b5adcbeb630ebb24ef5b4277c190f658" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.005ex; width:8.391ex; height:2.343ex;" alt="{\displaystyle 
\mathbf {y} =x_{1,J}}"></span> (consisting of <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle J}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>J</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle J}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/359e4f407b49910e02c27c2f52e87a36cd74c053" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.471ex; height:2.176ex;" alt="{\displaystyle J}"></span> tokens <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle x_{j}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>j</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle x_{j}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/5db47cb3d2f9496205a17a6856c91c1d3d363ccd" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.005ex; width:2.239ex; height:2.343ex;" alt="{\displaystyle x_{j}}"></span>) in the target language. The source and target tokens (which in the simple case are words) are embedded into vectors, so they can be processed mathematically.
</p><p>NMT models assign a probability <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle P(y|x)}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>P</mi> <mo stretchy="false">(</mo> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mi>x</mi> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle P(y|x)}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/5d08508dff9e465cc317804ff19999c4ffbf7d94" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:6.687ex; height:2.843ex;" alt="{\displaystyle P(y|x)}"></span><sup id="cite_ref-Stahlberg2020_2-1" class="reference"><a href="#cite_note-Stahlberg2020-2"><span class="cite-bracket">[</span>2<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 5">: 5 </span></sup><sup id="cite_ref-Tan2020_6-0" class="reference"><a href="#cite_note-Tan2020-6"><span class="cite-bracket">[</span>6<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 1">: 1 </span></sup> to potential translations y and then search a subset of potential translations for the one with the highest probability. Most NMT models are <i>auto-regressive</i>: They model the probability of each target token as a function of the source sentence and the previously predicted target tokens. 
The probability of the whole translation then is the product of the probabilities of the individual predicted tokens:<sup id="cite_ref-Stahlberg2020_2-2" class="reference"><a href="#cite_note-Stahlberg2020-2"><span class="cite-bracket">[</span>2<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 5">: 5 </span></sup><sup id="cite_ref-Tan2020_6-1" class="reference"><a href="#cite_note-Tan2020-6"><span class="cite-bracket">[</span>6<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 2">: 2 </span></sup> </p><p><span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle P(y|x)=\prod _{j=1}^{J}P(y_{j}|y_{1,i-1},\mathbf {x} )}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>P</mi> <mo stretchy="false">(</mo> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mi>x</mi> <mo stretchy="false">)</mo> <mo>=</mo> <munderover> <mo>∏<!-- ∏ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>j</mi> <mo>=</mo> <mn>1</mn> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>J</mi> </mrow> </munderover> <mi>P</mi> <mo stretchy="false">(</mo> <msub> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>j</mi> </mrow> </msub> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <msub> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> <mo>,</mo> <mi>i</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msub> <mo>,</mo> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="bold">x</mi> </mrow> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle P(y|x)=\prod _{j=1}^{J}P(y_{j}|y_{1,i-1},\mathbf {x} )}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f240a020e7bd71dbc0416eda2735c9139d40eb02" 
class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -3.338ex; width:27.156ex; height:7.676ex;" alt="{\displaystyle P(y|x)=\prod _{j=1}^{J}P(y_{j}|y_{1,i-1},\mathbf {x} )}"></span> </p><p>NMT models differ in how exactly they model this function <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle P}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>P</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle P}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/b4dc73bf40314945ff376bd363916a738548d40a" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.745ex; height:2.176ex;" alt="{\displaystyle P}"></span>, but most use some variation of the <i>encoder-decoder</i> architecture:<sup id="cite_ref-Tan2020_6-2" class="reference"><a href="#cite_note-Tan2020-6"><span class="cite-bracket">[</span>6<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 2">: 2 </span></sup><sup id="cite_ref-Goodfellow2013_7-0" class="reference"><a href="#cite_note-Goodfellow2013-7"><span class="cite-bracket">[</span>7<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 469">: 469 </span></sup> They first use an encoder network to process <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \mathbf {x} }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="bold">x</mi> </mrow> </mstyle> </mrow> <annotation 
encoding="application/x-tex">{\displaystyle \mathbf {x} }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/32adf004df5eb0a8c7fd8c0b6b7405183c5a5ef2" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.411ex; height:1.676ex;" alt="{\displaystyle \mathbf {x} }"></span> and encode it into a vector or matrix representation of the source sentence. Then they use a decoder network that usually produces one target word at a time, taking into account the source representation and the tokens it previously produced. As soon as the decoder produces a special <i>end of sentence</i> token, the decoding process is finished. Since the decoder refers to its own previous outputs during decoding, this way of decoding is called <i>auto-regressive</i>. </p> <div class="mw-heading mw-heading2"><h2 id="History">History</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Neural_machine_translation&action=edit&section=2" title="Edit section: History"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <div class="mw-heading mw-heading3"><h3 id="Early_approaches">Early approaches</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Neural_machine_translation&action=edit&section=3" title="Edit section: Early approaches"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>In 1987, Robert B. Allen demonstrated the use of <a href="/wiki/Feedforward_neural_network" title="Feedforward neural network">feed-forward neural networks</a> for translating auto-generated English sentences with a limited vocabulary of 31 words into Spanish.
In this experiment, the size of the network's input and output layers was chosen to be just large enough for the longest sentences in the source and target language, respectively, because the network did not have any mechanism to encode sequences of arbitrary length into a fixed-size representation. In his summary, Allen also already hinted at the possibility of using auto-associative models, one for encoding the source and one for decoding the target.<sup id="cite_ref-Allen1987_8-0" class="reference"><a href="#cite_note-Allen1987-8"><span class="cite-bracket">[</span>8<span class="cite-bracket">]</span></a></sup> </p><p>Lonnie Chrisman built upon Allen's work in 1991 by training separate <a href="/w/index.php?title=Recursive_auto-associative_memory&action=edit&redlink=1" class="new" title="Recursive auto-associative memory (page does not exist)">recursive auto-associative memory</a> (RAAM) networks (developed by <a href="/wiki/Jordan_Pollack" title="Jordan Pollack">Jordan B. Pollack</a><sup id="cite_ref-Pollack1990_9-0" class="reference"><a href="#cite_note-Pollack1990-9"><span class="cite-bracket">[</span>9<span class="cite-bracket">]</span></a></sup>) for the source and the target language. Each of the RAAM networks is trained to encode an arbitrary-length sentence into a fixed-size hidden representation and to decode the original sentence again from that representation. 
Additionally, the two networks are also trained to share their hidden representation; this way, the source encoder can produce a representation that the target decoder can decode.<sup id="cite_ref-Chrisman1991_10-0" class="reference"><a href="#cite_note-Chrisman1991-10"><span class="cite-bracket">[</span>10<span class="cite-bracket">]</span></a></sup> Forcada and Ñeco simplified this procedure in 1997 to directly train a source encoder and a target decoder in what they called a <i>recursive hetero-associative memory</i>.<sup id="cite_ref-Forcada1997_11-0" class="reference"><a href="#cite_note-Forcada1997-11"><span class="cite-bracket">[</span>11<span class="cite-bracket">]</span></a></sup> </p><p>Also in 1997, Castaño and Casacuberta employed an <a href="/wiki/Elman_network" class="mw-redirect" title="Elman network">Elman's recurrent neural network</a> in another machine translation task with very limited vocabulary and complexity.<sup id="cite_ref-Castano1997a_12-0" class="reference"><a href="#cite_note-Castano1997a-12"><span class="cite-bracket">[</span>12<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-Castano1997b_13-0" class="reference"><a href="#cite_note-Castano1997b-13"><span class="cite-bracket">[</span>13<span class="cite-bracket">]</span></a></sup> </p><p>Even though these early approaches were already similar to modern NMT, the computing resources of the time were not sufficient to process datasets large enough for the computational complexity of the machine translation problem on real-world texts.<sup id="cite_ref-Koehn2020_1-3" class="reference"><a href="#cite_note-Koehn2020-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 39">: 39 </span></sup><sup id="cite_ref-Yang2020_14-0" class="reference"><a href="#cite_note-Yang2020-14"><span class="cite-bracket">[</span>14<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 
2">: 2 </span></sup> Instead, other methods like <a href="/wiki/Statistical_machine_translation" title="Statistical machine translation">statistical machine translation</a> rose to become the state of the art of the 1990s and 2000s. </p> <div class="mw-heading mw-heading3"><h3 id="Hybrid_approaches">Hybrid approaches</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Neural_machine_translation&action=edit&section=4" title="Edit section: Hybrid approaches"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>During the time when statistical machine translation was prevalent, some works used neural methods to replace various parts in the statistical machine translation while still using the log-linear approach to tie them together.<sup id="cite_ref-Koehn2020_1-4" class="reference"><a href="#cite_note-Koehn2020-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 39">: 39 </span></sup><sup id="cite_ref-Stahlberg2020_2-3" class="reference"><a href="#cite_note-Stahlberg2020-2"><span class="cite-bracket">[</span>2<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 1">: 1 </span></sup> For example, in various works together with other researchers, Holger Schwenk replaced the usual <a href="/wiki/N-gram_language_model" class="mw-redirect" title="N-gram language model">n-gram language model</a> with a <a href="/wiki/Neural_language_model" class="mw-redirect" title="Neural language model">neural one</a><sup id="cite_ref-Schwenk2006_15-0" class="reference"><a href="#cite_note-Schwenk2006-15"><span class="cite-bracket">[</span>15<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-Schwenk2007_16-0" class="reference"><a href="#cite_note-Schwenk2007-16"><span class="cite-bracket">[</span>16<span class="cite-bracket">]</span></a></sup> and estimated phrase 
translation probabilities using a feed-forward network.<sup id="cite_ref-Schwenk2012_17-0" class="reference"><a href="#cite_note-Schwenk2012-17"><span class="cite-bracket">[</span>17<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="seq2seq">seq2seq</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Neural_machine_translation&action=edit&section=5" title="Edit section: seq2seq"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <style data-mw-deduplicate="TemplateStyles:r1236090951">.mw-parser-output .hatnote{font-style:italic}.mw-parser-output div.hatnote{padding-left:1.6em;margin-bottom:0.5em}.mw-parser-output .hatnote i{font-style:normal}.mw-parser-output .hatnote+link+.hatnote{margin-top:-0.5em}@media print{body.ns-0 .mw-parser-output .hatnote{display:none!important}}</style><div role="note" class="hatnote navigation-not-searchable">Main article: <a href="/wiki/Seq2seq" title="Seq2seq">seq2seq</a></div> <p>In 2013 and 2014, end-to-end neural machine translation had their breakthrough with Kalchbrenner & Blunsom using a <a href="/wiki/Convolutional_neural_network" title="Convolutional neural network">convolutional neural network</a> (CNN) for encoding the source<sup id="cite_ref-KalchbrennerBlunsom2013_18-0" class="reference"><a href="#cite_note-KalchbrennerBlunsom2013-18"><span class="cite-bracket">[</span>18<span class="cite-bracket">]</span></a></sup> and both Cho et al. and Sutskever et al. 
using a <a href="/wiki/Recurrent_neural_network" title="Recurrent neural network">recurrent neural network</a> (RNN) instead.<sup id="cite_ref-Cho2014EncDec_19-0" class="reference"><a href="#cite_note-Cho2014EncDec-19"><span class="cite-bracket">[</span>19<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-Sutskever2014_20-0" class="reference"><a href="#cite_note-Sutskever2014-20"><span class="cite-bracket">[</span>20<span class="cite-bracket">]</span></a></sup> All three used an RNN conditioned on a fixed encoding of the source as their decoder to produce the translation. However, these models performed poorly on longer sentences.<sup id="cite_ref-Cho2014Properties_21-0" class="reference"><a href="#cite_note-Cho2014Properties-21"><span class="cite-bracket">[</span>21<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 107">: 107 </span></sup><sup id="cite_ref-Koehn2020_1-5" class="reference"><a href="#cite_note-Koehn2020-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 39">: 39 </span></sup><sup id="cite_ref-Stahlberg2020_2-4" class="reference"><a href="#cite_note-Stahlberg2020-2"><span class="cite-bracket">[</span>2<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 7">: 7 </span></sup> This problem was addressed when Bahdanau et al. 
introduced <a href="/wiki/Attention_(machine_learning)" title="Attention (machine learning)">attention</a> to their encoder-decoder architecture: At each decoding step, the state of the decoder is used to calculate a source representation that focuses on different parts of the source and uses that representation in the calculation of the probabilities for the next token.<sup id="cite_ref-Bahdanau2015_22-0" class="reference"><a href="#cite_note-Bahdanau2015-22"><span class="cite-bracket">[</span>22<span class="cite-bracket">]</span></a></sup> Based on these RNN-based architectures, <a href="/wiki/Baidu" title="Baidu">Baidu</a> launched the "first large-scale NMT system"<sup id="cite_ref-Wang2022_23-0" class="reference"><a href="#cite_note-Wang2022-23"><span class="cite-bracket">[</span>23<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 144">: 144 </span></sup> in 2015, followed by <a href="/wiki/Google_Neural_Machine_Translation" title="Google Neural Machine Translation">Google Neural Machine Translation</a> in 2016.<sup id="cite_ref-Wang2022_23-1" class="reference"><a href="#cite_note-Wang2022-23"><span class="cite-bracket">[</span>23<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 144">: 144 </span></sup><sup id="cite_ref-Wu2016_24-0" class="reference"><a href="#cite_note-Wu2016-24"><span class="cite-bracket">[</span>24<span class="cite-bracket">]</span></a></sup> From that year on, neural models also became the prevailing choice in the main machine translation conference Workshop on Statistical Machine Translation.<sup id="cite_ref-WMT2016_25-0" class="reference"><a href="#cite_note-WMT2016-25"><span class="cite-bracket">[</span>25<span class="cite-bracket">]</span></a></sup> </p><p>Gehring et al. 
combined a CNN encoder with an attention mechanism in 2017, which handled long-range dependencies in the source better than previous approaches and also increased translation speed because a CNN encoder is parallelizable, whereas an <a href="/w/index.php?title=RNN_encoder&action=edit&redlink=1" class="new" title="RNN encoder (page does not exist)">RNN encoder</a> has to encode one token at a time due to its recurrent nature.<sup id="cite_ref-Gehring2017_26-0" class="reference"><a href="#cite_note-Gehring2017-26"><span class="cite-bracket">[</span>26<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 230">: 230 </span></sup> In the same year, “Microsoft Translator released AI-powered online neural machine translation (NMT).<sup id="cite_ref-27" class="reference"><a href="#cite_note-27"><span class="cite-bracket">[</span>27<span class="cite-bracket">]</span></a></sup> <a href="/wiki/DeepL_Translator" title="DeepL Translator">DeepL Translator</a>, which was at the time based on a <a href="/w/index.php?title=CNN_encoder&action=edit&redlink=1" class="new" title="CNN encoder (page does not exist)">CNN encoder</a>, was also released in the same year and was judged by several news outlets to outperform its competitors.<sup id="cite_ref-DeepLTechCrunch_28-0" class="reference"><a href="#cite_note-DeepLTechCrunch-28"><span class="cite-bracket">[</span>28<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-DeepLLeMonde_29-0" class="reference"><a href="#cite_note-DeepLLeMonde-29"><span class="cite-bracket">[</span>29<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-DeepLGolem_30-0" class="reference"><a href="#cite_note-DeepLGolem-30"><span class="cite-bracket">[</span>30<span class="cite-bracket">]</span></a></sup> It has also been seen that <a href="/wiki/OpenAI" title="OpenAI">OpenAI</a>'s <a href="/wiki/GPT-3" title="GPT-3">GPT-3</a> released in 2020 can function as a neural machine translation system. 
Some other machine translation systems, such as Microsoft Translator and SYSTRAN, can also be seen to have integrated neural networks into their operations.
Since the attention mechanism does not have any notion of token order, but the order of words in a sentence is obviously relevant, the token embeddings are combined with an <a href="/wiki/Transformer_(machine_learning_model)#Positional_encoding" class="mw-redirect" title="Transformer (machine learning model)">explicit encoding of their position in the sentence</a>.<sup id="cite_ref-Stahlberg2020_2-5" class="reference"><a href="#cite_note-Stahlberg2020-2"><span class="cite-bracket">[</span>2<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 15">: 15 </span></sup><sup id="cite_ref-Tan2020_6-3" class="reference"><a href="#cite_note-Tan2020-6"><span class="cite-bracket">[</span>6<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 7">: 7 </span></sup> Since both the transformer's encoder and decoder are free from recurrent elements, they can both be parallelized during training. However, the original transformer's decoder is still auto-regressive, which means that decoding still has to be done one token at a time during inference. 
</p><p>The transformer model quickly became the dominant choice for machine translation systems<sup id="cite_ref-Stahlberg2020_2-6" class="reference"><a href="#cite_note-Stahlberg2020-2"><span class="cite-bracket">[</span>2<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 44">: 44 </span></sup> and was still by far the most-used architecture in the Workshop on Statistical Machine Translation in 2022 and 2023.<sup id="cite_ref-WMT2022_32-0" class="reference"><a href="#cite_note-WMT2022-32"><span class="cite-bracket">[</span>32<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 35–40">: 35–40 </span></sup><sup id="cite_ref-WMT2023_33-0" class="reference"><a href="#cite_note-WMT2023-33"><span class="cite-bracket">[</span>33<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 28–31">: 28–31 </span></sup> </p><p>Usually, NMT models’ weights are initialized randomly and then learned by training on parallel datasets. However, since using <a href="/wiki/Large_language_model" title="Large language model">large language models</a> (LLMs) such as <a href="/wiki/BERT_(language_model)" title="BERT (language model)">BERT</a> pre-trained on large amounts of monolingual data as <a href="/wiki/Fine-tuning_(deep_learning)" title="Fine-tuning (deep learning)">a starting point for learning other tasks</a> has proven very successful in wider <a href="/wiki/Natural_language_processing" title="Natural language processing">NLP</a>, this paradigm is also becoming more prevalent in NMT. 
This is especially useful for low-resource languages, where large parallel datasets do not exist.<sup id="cite_ref-Haddow2022_4-1" class="reference"><a href="#cite_note-Haddow2022-4"><span class="cite-bracket">[</span>4<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 689–690">: 689–690 </span></sup> An example of this is the mBART model, which first trains one transformer on a multilingual dataset to recover masked tokens in sentences, and then fine-tunes the resulting <a href="/wiki/Autoencoder" title="Autoencoder">autoencoder</a> on the translation task.<sup id="cite_ref-Liu2020_34-0" class="reference"><a href="#cite_note-Liu2020-34"><span class="cite-bracket">[</span>34<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading4"><h4 id="Generative_LLMs">Generative LLMs</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Neural_machine_translation&action=edit&section=7" title="Edit section: Generative LLMs"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Instead of fine-tuning a pre-trained language model on the translation task, sufficiently large <a href="/wiki/Generative_model#Deep_generative_models" title="Generative model">generative models</a> can also be directly prompted to translate a sentence into the desired language. This approach was first comprehensively tested and evaluated for <a href="/wiki/GPT_3.5" class="mw-redirect" title="GPT 3.5">GPT 3.5</a> in 2023 by Hendy et al. 
They found that "GPT systems can produce highly fluent and competitive translation outputs even in the <a href="/wiki/Zero-shot_learning" title="Zero-shot learning">zero-shot</a> setting especially for the high-resource language translations".<sup id="cite_ref-Hendy2023_35-0" class="reference"><a href="#cite_note-Hendy2023-35"><span class="cite-bracket">[</span>35<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 22">: 22 </span></sup> The WMT23 evaluated the same approach (but using <a href="/wiki/GPT-4" title="GPT-4">GPT-4</a>) and found that it was on par with the state of the art when translating into English, but not quite when translating into lower-resource languages.<sup id="cite_ref-WMT2023_33-1" class="reference"><a href="#cite_note-WMT2023-33"><span class="cite-bracket">[</span>33<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 16–17">: 16–17 </span></sup> This is plausible considering that GPT models are trained mainly on English text.<sup id="cite_ref-GPT3LanguagesByCharacterCount2020_36-0" class="reference"><a href="#cite_note-GPT3LanguagesByCharacterCount2020-36"><span class="cite-bracket">[</span>36<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading2"><h2 id="Comparison_with_statistical_machine_translation">Comparison with statistical machine translation</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Neural_machine_translation&action=edit&section=8" title="Edit section: Comparison with statistical machine translation"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>NMT has overcome several challenges that were present in statistical machine translation (SMT): </p> <ul><li>NMT's full reliance on continuous representation of tokens overcame sparsity issues caused by rare words or phrases. 
Models were able to generalize more effectively.<sup id="cite_ref-KalchbrennerBlunsom2013_18-1" class="reference"><a href="#cite_note-KalchbrennerBlunsom2013-18"><span class="cite-bracket">[</span>18<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 1">: 1 </span></sup><sup id="cite_ref-Russell2020_37-0" class="reference"><a href="#cite_note-Russell2020-37"><span class="cite-bracket">[</span>37<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 900–901">: 900–901 </span></sup></li> <li>The limited n-gram length used in SMT's n-gram language models caused a loss of context. NMT systems overcome this by not having a hard cut-off after a fixed number of tokens and by using attention to choosing which tokens to focus on when generating the next token.<sup id="cite_ref-Russell2020_37-1" class="reference"><a href="#cite_note-Russell2020-37"><span class="cite-bracket">[</span>37<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 900–901">: 900–901 </span></sup></li> <li>End-to-end training of a single model improved translation performance and also simplified the whole process.<sup class="noprint Inline-Template Template-Fact" style="white-space:nowrap;">[<i><a href="/wiki/Wikipedia:Citation_needed" title="Wikipedia:Citation needed"><span title="This claim needs references to reliable sources. 
(December 2023)">citation needed</span></a></i>]</sup></li> <li>The huge n-gram models (up to 7-gram) used in SMT required large amounts of memory,<sup id="cite_ref-Federico2007_38-0" class="reference"><a href="#cite_note-Federico2007-38"><span class="cite-bracket">[</span>38<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 88">: 88 </span></sup> whereas NMT requires less.</li></ul> <div class="mw-heading mw-heading2"><h2 id="Training_procedure">Training procedure</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Neural_machine_translation&action=edit&section=9" title="Edit section: Training procedure"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <div class="mw-heading mw-heading3"><h3 id="Cross-entropy_loss">Cross-entropy loss</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Neural_machine_translation&action=edit&section=10" title="Edit section: Cross-entropy loss"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>NMT models are usually trained to maximize the likelihood of observing the training data. 
That is, for a dataset of
corresponding target sentences <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle Y=\mathbf {y} ^{(1)},...,\mathbf {y} ^{(T)}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>Y</mi> <mo>=</mo> <msup> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="bold">y</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mn>1</mn> <mo stretchy="false">)</mo> </mrow> </msup> <mo>,</mo> <mo>.</mo> <mo>.</mo> <mo>.</mo> <mo>,</mo> <msup> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="bold">y</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mi>T</mi> <mo stretchy="false">)</mo> </mrow> </msup> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle Y=\mathbf {y} ^{(1)},...,\mathbf {y} ^{(T)}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/4ac2062328189630770b3ddc7e567ba0904e1c97" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:17.866ex; height:3.176ex;" alt="{\displaystyle Y=\mathbf {y} ^{(1)},...,\mathbf {y} ^{(T)}}"></span>, the goal is finding the model parameters <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \theta ^{*}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msup> <mi>θ<!-- θ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mo>∗<!-- ∗ --></mo> </mrow> </msup> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \theta ^{*}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/5c77b69ba747144843a16bb2e053e9fcd8583735" class="mwe-math-fallback-image-inline 
mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:2.145ex; height:2.343ex;" alt="{\displaystyle \theta ^{*}}"></span> that maximize the sum of the likelihood of each target sentence in the training data given the corresponding source sentence: </p><p><span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \theta ^{*}={\underset {\theta }{\operatorname {arg\,max} }}\sum _{i}^{T}P_{\theta }(\mathbf {y} ^{(i)}|\mathbf {x} ^{(i)})}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msup> <mi>θ<!-- θ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mo>∗<!-- ∗ --></mo> </mrow> </msup> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <munder> <mrow class="MJX-TeXAtom-OP MJX-fixedlimits"> <mi mathvariant="normal">a</mi> <mi mathvariant="normal">r</mi> <mi mathvariant="normal">g</mi> <mspace width="thinmathspace" /> <mi mathvariant="normal">m</mi> <mi mathvariant="normal">a</mi> <mi mathvariant="normal">x</mi> </mrow> <mi>θ<!-- θ --></mi> </munder> </mrow> <munderover> <mo>∑<!-- ∑ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>T</mi> </mrow> </munderover> <msub> <mi>P</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>θ<!-- θ --></mi> </mrow> </msub> <mo stretchy="false">(</mo> <msup> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="bold">y</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mi>i</mi> <mo stretchy="false">)</mo> </mrow> </msup> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <msup> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="bold">x</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mi>i</mi> <mo stretchy="false">)</mo> </mrow> </msup> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \theta ^{*}={\underset 
{\theta }{\operatorname {arg\,max} }}\sum _{i}^{T}P_{\theta }(\mathbf {y} ^{(i)}|\mathbf {x} ^{(i)})}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/9a56cbb62b6ad528282daf1349f4daa6bbb4768e" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -3.005ex; width:29.254ex; height:7.343ex;" alt="{\displaystyle \theta ^{*}={\underset {\theta }{\operatorname {arg\,max} }}\sum _{i}^{T}P_{\theta }(\mathbf {y} ^{(i)}|\mathbf {x} ^{(i)})}"></span> </p><p>Expanding to token level yields: </p><p><span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \theta ^{*}={\underset {\theta }{\operatorname {arg\,max} }}\sum _{i}^{T}\prod _{j=1}^{J^{(i)}}P(y_{j}^{(i)}|y_{1,j-1}^{(i)},\mathbf {x} ^{(i)})}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msup> <mi>θ<!-- θ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mo>∗<!-- ∗ --></mo> </mrow> </msup> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <munder> <mrow class="MJX-TeXAtom-OP MJX-fixedlimits"> <mi mathvariant="normal">a</mi> <mi mathvariant="normal">r</mi> <mi mathvariant="normal">g</mi> <mspace width="thinmathspace" /> <mi mathvariant="normal">m</mi> <mi mathvariant="normal">a</mi> <mi mathvariant="normal">x</mi> </mrow> <mi>θ<!-- θ --></mi> </munder> </mrow> <munderover> <mo>∑<!-- ∑ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>T</mi> </mrow> </munderover> <munderover> <mo>∏<!-- ∏ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>j</mi> <mo>=</mo> <mn>1</mn> </mrow> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi>J</mi> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mi>i</mi> <mo stretchy="false">)</mo> </mrow> </msup> </mrow> </munderover> <mi>P</mi> <mo stretchy="false">(</mo> <msubsup> <mi>y</mi> 
<mrow class="MJX-TeXAtom-ORD"> <mi>j</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mi>i</mi> <mo stretchy="false">)</mo> </mrow> </msubsup> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <msubsup> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> <mo>,</mo> <mi>j</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mi>i</mi> <mo stretchy="false">)</mo> </mrow> </msubsup> <mo>,</mo> <msup> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="bold">x</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mi>i</mi> <mo stretchy="false">)</mo> </mrow> </msup> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \theta ^{*}={\underset {\theta }{\operatorname {arg\,max} }}\sum _{i}^{T}\prod _{j=1}^{J^{(i)}}P(y_{j}^{(i)}|y_{1,j-1}^{(i)},\mathbf {x} ^{(i)})}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/8d1eb359cdcbce911d61be7ed5430acc3e83b0a9" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -3.338ex; width:38.072ex; height:8.176ex;" alt="{\displaystyle \theta ^{*}={\underset {\theta }{\operatorname {arg\,max} }}\sum _{i}^{T}\prod _{j=1}^{J^{(i)}}P(y_{j}^{(i)}|y_{1,j-1}^{(i)},\mathbf {x} ^{(i)})}"></span> </p><p>Since we are only interested in the maximum, we can just as well search for the maximum of the logarithm instead (which has the advantage that it avoids <a href="/wiki/Arithmetic_underflow" title="Arithmetic underflow">floating point underflow</a> that could happen with the product of low probabilities). 
</p><p><span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \theta ^{*}={\underset {\theta }{\operatorname {arg\,max} }}\sum _{i}^{T}\log \prod _{j=1}^{J^{(i)}}P(y_{j}^{(i)}|y_{1,j-1}^{(i)},\mathbf {x} ^{(i)})}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msup> <mi>θ<!-- θ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mo>∗<!-- ∗ --></mo> </mrow> </msup> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <munder> <mrow class="MJX-TeXAtom-OP MJX-fixedlimits"> <mi mathvariant="normal">a</mi> <mi mathvariant="normal">r</mi> <mi mathvariant="normal">g</mi> <mspace width="thinmathspace" /> <mi mathvariant="normal">m</mi> <mi mathvariant="normal">a</mi> <mi mathvariant="normal">x</mi> </mrow> <mi>θ<!-- θ --></mi> </munder> </mrow> <munderover> <mo>∑<!-- ∑ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>T</mi> </mrow> </munderover> <mi>log</mi> <mo>⁡<!-- --></mo> <munderover> <mo>∏<!-- ∏ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>j</mi> <mo>=</mo> <mn>1</mn> </mrow> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi>J</mi> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mi>i</mi> <mo stretchy="false">)</mo> </mrow> </msup> </mrow> </munderover> <mi>P</mi> <mo stretchy="false">(</mo> <msubsup> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>j</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mi>i</mi> <mo stretchy="false">)</mo> </mrow> </msubsup> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <msubsup> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> <mo>,</mo> <mi>j</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mi>i</mi> <mo stretchy="false">)</mo> </mrow> </msubsup> <mo>,</mo> <msup> <mrow class="MJX-TeXAtom-ORD"> <mi 
mathvariant="bold">x</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mi>i</mi> <mo stretchy="false">)</mo> </mrow> </msup> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \theta ^{*}={\underset {\theta }{\operatorname {arg\,max} }}\sum _{i}^{T}\log \prod _{j=1}^{J^{(i)}}P(y_{j}^{(i)}|y_{1,j-1}^{(i)},\mathbf {x} ^{(i)})}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/325fbebc21da2f71fb7aae22fb76bf5c6e389a71" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -3.338ex; width:41.431ex; height:8.176ex;" alt="{\displaystyle \theta ^{*}={\underset {\theta }{\operatorname {arg\,max} }}\sum _{i}^{T}\log \prod _{j=1}^{J^{(i)}}P(y_{j}^{(i)}|y_{1,j-1}^{(i)},\mathbf {x} ^{(i)})}"></span> </p><p>Using the fact that <a href="/wiki/List_of_logarithmic_identities#Logarithm_of_a_product" title="List of logarithmic identities">the logarithm of a product is the sum of the factors’ logarithms</a> and flipping the sign yields the classic <a href="/wiki/Cross-entropy#Cross-entropy_loss_function_and_logistic_regression" title="Cross-entropy">cross-entropy loss</a>: </p><p><span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \theta ^{*}={\underset {\theta }{\operatorname {arg\,min} }}-\sum _{i}^{T}\sum _{j=1}^{J^{(i)}}\log P(y_{j}^{(i)}|y_{1,j-1}^{(i)},\mathbf {x} ^{(i)})}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msup> <mi>θ<!-- θ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mo>∗<!-- ∗ --></mo> </mrow> </msup> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <munder> <mrow class="MJX-TeXAtom-OP MJX-fixedlimits"> <mi mathvariant="normal">a</mi> <mi mathvariant="normal">r</mi> <mi mathvariant="normal">g</mi> <mspace
width="thinmathspace" /> <mi mathvariant="normal">m</mi> <mi mathvariant="normal">i</mi> <mi mathvariant="normal">n</mi> </mrow> <mi>θ<!-- θ --></mi> </munder> </mrow> <mo>−<!-- − --></mo> <munderover> <mo>∑<!-- ∑ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>T</mi> </mrow> </munderover> <munderover> <mo>∑<!-- ∑ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>j</mi> <mo>=</mo> <mn>1</mn> </mrow> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi>J</mi> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mi>i</mi> <mo stretchy="false">)</mo> </mrow> </msup> </mrow> </munderover> <mi>log</mi> <mo>⁡<!-- --></mo> <mi>P</mi> <mo stretchy="false">(</mo> <msubsup> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>j</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mi>i</mi> <mo stretchy="false">)</mo> </mrow> </msubsup> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <msubsup> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> <mo>,</mo> <mi>j</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mi>i</mi> <mo stretchy="false">)</mo> </mrow> </msubsup> <mo>,</mo> <msup> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="bold">x</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">(</mo> <mi>i</mi> <mo stretchy="false">)</mo> </mrow> </msup> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \theta ^{*}={\underset {\theta }{\operatorname {arg\,min} }}-\sum _{i}^{T}\sum _{j=1}^{J^{(i)}}\log P(y_{j}^{(i)}|y_{1,j-1}^{(i)},\mathbf {x} ^{(i)})}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/faae4dde705b8c5c1e07abd246c397808624fd7b" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -3.338ex; width:43.819ex; height:8.176ex;" alt="{\displaystyle \theta ^{*}={\underset {\theta 
}{\operatorname {arg\,min} }}-\sum _{i}^{T}\sum _{j=1}^{J^{(i)}}\log P(y_{j}^{(i)}|y_{1,j-1}^{(i)},\mathbf {x} ^{(i)})}"></span> </p><p>In practice, this minimization is done iteratively on small subsets (mini-batches) of the training set using <a href="/wiki/Stochastic_gradient_descent" title="Stochastic gradient descent">stochastic gradient descent</a>. </p> <div class="mw-heading mw-heading3"><h3 id="Teacher_forcing">Teacher forcing</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Neural_machine_translation&action=edit&section=11" title="Edit section: Teacher forcing"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">Main article: <a href="/wiki/Teacher_forcing" title="Teacher forcing">Teacher forcing</a></div> <p>During inference, auto-regressive decoders use the token generated in the previous step as the input token. However, the vocabulary of target tokens is usually very large. So, at the beginning of the training phase, untrained models will almost always pick the wrong token; and subsequent steps would then have to work with wrong input tokens, which would slow down training considerably. Instead, <i>teacher forcing</i> is used during the training phase: The model (the “student” in the teacher forcing metaphor) is always fed the previous ground-truth tokens as input for the next token, regardless of what it predicted in the previous step. 
</p> <div class="mw-heading mw-heading2"><h2 id="Translation_by_prompt_engineering_LLMs">Translation by prompt engineering LLMs</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Neural_machine_translation&action=edit&section=12" title="Edit section: Translation by prompt engineering LLMs"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>As outlined in the history section above, instead of using an NMT system that is trained on parallel text, one can also prompt a generative LLM to translate a text. These models differ from an encoder-decoder NMT system in a number of ways:<sup id="cite_ref-Hendy2023_35-1" class="reference"><a href="#cite_note-Hendy2023-35"><span class="cite-bracket">[</span>35<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 1">: 1 </span></sup> </p> <ul><li>Generative language models are not trained on the translation task, let alone on a parallel dataset. Instead, they are trained on a language modeling objective, such as predicting the next word in a sequence drawn from a large dataset of text. This dataset can contain documents in many languages, but is in practice dominated by English text.<sup id="cite_ref-GPT3LanguagesByCharacterCount2020_36-1" class="reference"><a href="#cite_note-GPT3LanguagesByCharacterCount2020-36"><span class="cite-bracket">[</span>36<span class="cite-bracket">]</span></a></sup> After this pre-training, they are <a href="/wiki/Large_language_model#Training_and_architecture" title="Large language model">fine-tuned on another task</a>, usually to follow instructions.<sup id="cite_ref-Radford2018_39-0" class="reference"><a href="#cite_note-Radford2018-39"><span class="cite-bracket">[</span>39<span class="cite-bracket">]</span></a></sup></li> <li>Since they are not trained on translation, they also do not feature an encoder-decoder architecture. 
Instead, they just consist of a transformer's decoder.</li> <li>In order to be competitive on the machine translation task, LLMs need to be much larger than other NMT systems. E.g., GPT-3 has 175 billion parameters,<sup id="cite_ref-Brown2020_40-0" class="reference"><a href="#cite_note-Brown2020-40"><span class="cite-bracket">[</span>40<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 5">: 5 </span></sup> while mBART has 680 million<sup id="cite_ref-Liu2020_34-1" class="reference"><a href="#cite_note-Liu2020-34"><span class="cite-bracket">[</span>34<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 727">: 727 </span></sup> and the original transformer-big has “only” 213 million.<sup id="cite_ref-Vaswani2017_31-1" class="reference"><a href="#cite_note-Vaswani2017-31"><span class="cite-bracket">[</span>31<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page: 9">: 9 </span></sup> This means that they are computationally more expensive to train and use.</li></ul> <p>A generative LLM can be prompted in a <a href="/wiki/Zero-shot_learning" title="Zero-shot learning">zero-shot</a> fashion by just asking it to translate a text into another language without giving any further examples in the prompt. Or one can include one or several example translations in the prompt before asking to translate the text in question. This is then called <a href="/wiki/Few-shot_learning_(natural_language_processing)" class="mw-redirect" title="Few-shot learning (natural language processing)">one-shot or few-shot learning</a>, respectively. For example, the following prompts were used by Hendy et al. 
(2023) for zero-shot and one-shot translation:<sup id="cite_ref-Hendy2023_35-2" class="reference"><a href="#cite_note-Hendy2023-35"><span class="cite-bracket">[</span>35<span class="cite-bracket">]</span></a></sup> </p> <pre>### Translate this sentence from [source language] to [target language], Source: [source sentence] ### Target: </pre> <pre>Translate this into 1. [target language]: [shot 1 source] 1. [shot 1 reference] Translate this into 1. [target language]: [input] 1.</pre> <div class="mw-heading mw-heading2"><h2 id="Literature">Literature</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Neural_machine_translation&action=edit&section=13" title="Edit section: Literature"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <ul><li><a href="/wiki/Philipp_Koehn" title="Philipp Koehn">Koehn, Philipp</a> (2020). <a rel="nofollow" class="external text" href="http://www2.statmt.org/nmt-book/">Neural Machine Translation.</a> Cambridge University Press.</li> <li>Stahlberg, Felix (2020). 
<a rel="nofollow" class="external text" href="https://arxiv.org/abs/1912.02047v2">Neural Machine Translation: A Review and Survey.</a></li></ul> <div class="mw-heading mw-heading2"><h2 id="See_also">See also</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Neural_machine_translation&action=edit&section=14" title="Edit section: See also"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <ul><li><a href="/wiki/Attention_(machine_learning)" title="Attention (machine learning)">Attention (machine learning)</a></li> <li><a href="/wiki/Transformer_(machine_learning_model)" class="mw-redirect" title="Transformer (machine learning model)">Transformer (machine learning model)</a></li> <li><a href="/wiki/Seq2seq" title="Seq2seq">Seq2seq</a></li></ul> <div class="mw-heading mw-heading2"><h2 id="References">References</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Neural_machine_translation&action=edit&section=15" title="Edit section: References"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <style data-mw-deduplicate="TemplateStyles:r1239543626">.mw-parser-output .reflist{margin-bottom:0.5em;list-style-type:decimal}@media screen{.mw-parser-output .reflist{font-size:90%}}.mw-parser-output .reflist .references{font-size:100%;margin-bottom:0;list-style-type:inherit}.mw-parser-output .reflist-columns-2{column-width:30em}.mw-parser-output .reflist-columns-3{column-width:25em}.mw-parser-output .reflist-columns{margin-top:0.3em}.mw-parser-output .reflist-columns ol{margin-top:0}.mw-parser-output .reflist-columns li{page-break-inside:avoid;break-inside:avoid-column}.mw-parser-output .reflist-upper-alpha{list-style-type:upper-alpha}.mw-parser-output .reflist-upper-roman{list-style-type:upper-roman}.mw-parser-output .reflist-lower-alpha{list-style-type:lower-alpha}.mw-parser-output 
.reflist-lower-greek{list-style-type:lower-greek}.mw-parser-output .reflist-lower-roman{list-style-type:lower-roman}</style><div class="reflist"> <div class="mw-references-wrap mw-references-columns"><ol class="references"> <li id="cite_note-Koehn2020-1"><span class="mw-cite-backlink">^ <a href="#cite_ref-Koehn2020_1-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Koehn2020_1-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-Koehn2020_1-2"><sup><i><b>c</b></i></sup></a> <a href="#cite_ref-Koehn2020_1-3"><sup><i><b>d</b></i></sup></a> <a href="#cite_ref-Koehn2020_1-4"><sup><i><b>e</b></i></sup></a> <a href="#cite_ref-Koehn2020_1-5"><sup><i><b>f</b></i></sup></a></span> <span class="reference-text"><style data-mw-deduplicate="TemplateStyles:r1238218222">.mw-parser-output cite.citation{font-style:inherit;word-wrap:break-word}.mw-parser-output .citation q{quotes:"\"""\"""'""'"}.mw-parser-output .citation:target{background-color:rgba(0,127,255,0.133)}.mw-parser-output .id-lock-free.id-lock-free a{background:url("//upload.wikimedia.org/wikipedia/commons/6/65/Lock-green.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-limited.id-lock-limited a,.mw-parser-output .id-lock-registration.id-lock-registration a{background:url("//upload.wikimedia.org/wikipedia/commons/d/d6/Lock-gray-alt-2.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-subscription.id-lock-subscription a{background:url("//upload.wikimedia.org/wikipedia/commons/a/aa/Lock-red-alt-2.svg")right 0.1em center/9px no-repeat}.mw-parser-output .cs1-ws-icon a{background:url("//upload.wikimedia.org/wikipedia/commons/4/4c/Wikisource-logo.svg")right 0.1em center/12px no-repeat}body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-free a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-limited a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-registration a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output 
.id-lock-subscription a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .cs1-ws-icon a{background-size:contain;padding:0 1em 0 0}.mw-parser-output .cs1-code{color:inherit;background:inherit;border:none;padding:inherit}.mw-parser-output .cs1-hidden-error{display:none;color:var(--color-error,#d33)}.mw-parser-output .cs1-visible-error{color:var(--color-error,#d33)}.mw-parser-output .cs1-maint{display:none;color:#085;margin-left:0.3em}.mw-parser-output .cs1-kern-left{padding-left:0.2em}.mw-parser-output .cs1-kern-right{padding-right:0.2em}.mw-parser-output .citation .mw-selflink{font-weight:inherit}@media screen{.mw-parser-output .cs1-format{font-size:95%}html.skin-theme-clientpref-night .mw-parser-output .cs1-maint{color:#18911f}}@media screen and (prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .cs1-maint{color:#18911f}}</style><cite id="CITEREFKoehn2020" class="citation book cs1">Koehn, Philipp (2020). <a rel="nofollow" class="external text" href="http://www2.statmt.org/nmt-book/"><i>Neural Machine Translation</i></a>. 
Cambridge University Press.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Neural+Machine+Translation&rft.pub=Cambridge+University+Press&rft.date=2020&rft.aulast=Koehn&rft.aufirst=Philipp&rft_id=http%3A%2F%2Fwww2.statmt.org%2Fnmt-book%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Stahlberg2020-2"><span class="mw-cite-backlink">^ <a href="#cite_ref-Stahlberg2020_2-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Stahlberg2020_2-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-Stahlberg2020_2-2"><sup><i><b>c</b></i></sup></a> <a href="#cite_ref-Stahlberg2020_2-3"><sup><i><b>d</b></i></sup></a> <a href="#cite_ref-Stahlberg2020_2-4"><sup><i><b>e</b></i></sup></a> <a href="#cite_ref-Stahlberg2020_2-5"><sup><i><b>f</b></i></sup></a> <a href="#cite_ref-Stahlberg2020_2-6"><sup><i><b>g</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFStahlberg2020" class="citation arxiv cs1">Stahlberg, Felix (2020-09-29). "Neural Machine Translation: A Review and Survey". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1912.02047v2">1912.02047v2</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Neural+Machine+Translation%3A+A+Review+and+Survey&rft.date=2020-09-29&rft_id=info%3Aarxiv%2F1912.02047v2&rft.aulast=Stahlberg&rft.aufirst=Felix&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Popel2020-3"><span class="mw-cite-backlink"><b><a href="#cite_ref-Popel2020_3-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPopelTomkovaTomekKaiser2020" class="citation journal cs1">Popel, Martin; Tomkova, Marketa; Tomek, Jakub; Kaiser, Łukasz; Uszkoreit, Jakob; Bojar, Ondřej; Žabokrtský, Zdeněk (2020-09-01). <a rel="nofollow" class="external text" href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7463233">"Transforming machine translation: a deep learning system reaches news translation quality comparable to human professionals"</a>. <i>Nature Communications</i>. <b>11</b> (1): 4381. <a href="/wiki/Bibcode_(identifier)" class="mw-redirect" title="Bibcode (identifier)">Bibcode</a>:<a rel="nofollow" class="external text" href="https://ui.adsabs.harvard.edu/abs/2020NatCo..11.4381P">2020NatCo..11.4381P</a>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1038%2Fs41467-020-18073-9">10.1038/s41467-020-18073-9</a>. 
<a href="/wiki/Hdl_(identifier)" class="mw-redirect" title="Hdl (identifier)">hdl</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://hdl.handle.net/11346%2FBIBLIO%40id%3D368112263610994118">11346/BIBLIO@id=368112263610994118</a></span>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/2041-1723">2041-1723</a>. <a href="/wiki/PMC_(identifier)" class="mw-redirect" title="PMC (identifier)">PMC</a> <span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7463233">7463233</a></span>. <a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a> <a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/32873773">32873773</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Nature+Communications&rft.atitle=Transforming+machine+translation%3A+a+deep+learning+system+reaches+news+translation+quality+comparable+to+human+professionals&rft.volume=11&rft.issue=1&rft.pages=4381&rft.date=2020-09-01&rft_id=https%3A%2F%2Fwww.ncbi.nlm.nih.gov%2Fpmc%2Farticles%2FPMC7463233%23id-name%3DPMC&rft_id=info%3Abibcode%2F2020NatCo..11.4381P&rft_id=info%3Adoi%2F10.1038%2Fs41467-020-18073-9&rft.issn=2041-1723&rft_id=info%3Apmid%2F32873773&rft_id=info%3Ahdl%2F11346%2FBIBLIO%40id%3D368112263610994118&rft.aulast=Popel&rft.aufirst=Martin&rft.au=Tomkova%2C+Marketa&rft.au=Tomek%2C+Jakub&rft.au=Kaiser%2C+%C5%81ukasz&rft.au=Uszkoreit%2C+Jakob&rft.au=Bojar%2C+Ond%C5%99ej&rft.au=%C5%BDabokrtsk%C3%BD%2C+Zden%C4%9Bk&rft_id=https%3A%2F%2Fwww.ncbi.nlm.nih.gov%2Fpmc%2Farticles%2FPMC7463233&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Haddow2022-4"><span 
class="mw-cite-backlink">^ <a href="#cite_ref-Haddow2022_4-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Haddow2022_4-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFHaddowBawdenMiceli_BaroneHelcl2022" class="citation journal cs1">Haddow, Barry; Bawden, Rachel; Miceli Barone, Antonio Valerio; Helcl, Jindřich; Birch, Alexandra (2022). <a rel="nofollow" class="external text" href="https://aclanthology.org/2022.cl-3.6">"Survey of Low-Resource Machine Translation"</a>. <i>Computational Linguistics</i>. <b>48</b> (3): 673–732. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2109.00486">2109.00486</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1162%2Fcoli_a_00446">10.1162/coli_a_00446</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Computational+Linguistics&rft.atitle=Survey+of+Low-Resource+Machine+Translation&rft.volume=48&rft.issue=3&rft.pages=673-732&rft.date=2022&rft_id=info%3Aarxiv%2F2109.00486&rft_id=info%3Adoi%2F10.1162%2Fcoli_a_00446&rft.aulast=Haddow&rft.aufirst=Barry&rft.au=Bawden%2C+Rachel&rft.au=Miceli+Barone%2C+Antonio+Valerio&rft.au=Helcl%2C+Jind%C5%99ich&rft.au=Birch%2C+Alexandra&rft_id=https%3A%2F%2Faclanthology.org%2F2022.cl-3.6&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Poibeau2022-5"><span class="mw-cite-backlink">^ <a href="#cite_ref-Poibeau2022_5-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Poibeau2022_5-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link 
rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPoibeau2022" class="citation journal cs1">Poibeau, Thierry (2022). Calzolari, Nicoletta; Béchet, Frédéric; Blache, Philippe; Choukri, Khalid; Cieri, Christopher; Declerck, Thierry; Goggi, Sara; Isahara, Hitoshi; Maegaard, Bente (eds.). <a rel="nofollow" class="external text" href="https://aclanthology.org/2022.lrec-1.647">"On "Human Parity" and "Super Human Performance" in Machine Translation Evaluation"</a>. <i>Proceedings of the Thirteenth Language Resources and Evaluation Conference</i>. Marseille, France: European Language Resources Association: 6018–6023.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Proceedings+of+the+Thirteenth+Language+Resources+and+Evaluation+Conference&rft.atitle=On+%22Human+Parity%22+and+%22Super+Human+Performance%22+in+Machine+Translation+Evaluation&rft.pages=6018-6023&rft.date=2022&rft.aulast=Poibeau&rft.aufirst=Thierry&rft_id=https%3A%2F%2Faclanthology.org%2F2022.lrec-1.647&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Tan2020-6"><span class="mw-cite-backlink">^ <a href="#cite_ref-Tan2020_6-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Tan2020_6-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-Tan2020_6-2"><sup><i><b>c</b></i></sup></a> <a href="#cite_ref-Tan2020_6-3"><sup><i><b>d</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFTanWangYangChen2020" class="citation arxiv cs1">Tan, Zhixing; Wang, Shuo; Yang, Zonghan; Chen, Gang; Huang, Xuancheng; Sun, Maosong; Liu, Yang (2020-12-31). "Neural Machine Translation: A Review of Methods, Resources, and Tools". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2012.15515">2012.15515</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Neural+Machine+Translation%3A+A+Review+of+Methods%2C+Resources%2C+and+Tools&rft.date=2020-12-31&rft_id=info%3Aarxiv%2F2012.15515&rft.aulast=Tan&rft.aufirst=Zhixing&rft.au=Wang%2C+Shuo&rft.au=Yang%2C+Zonghan&rft.au=Chen%2C+Gang&rft.au=Huang%2C+Xuancheng&rft.au=Sun%2C+Maosong&rft.au=Liu%2C+Yang&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Goodfellow2013-7"><span class="mw-cite-backlink"><b><a href="#cite_ref-Goodfellow2013_7-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFGoodfellowBengioCourville2016" class="citation book cs1">Goodfellow, Ian; Bengio, Yoshua; Courville, Aaron (2016). <a rel="nofollow" class="external text" href="https://www.deeplearningbook.org/contents/applications.html">"12.4.5 Neural Machine Translation"</a>. <a rel="nofollow" class="external text" href="https://www.deeplearningbook.org/"><i>Deep Learning</i></a>. MIT Press. pp. 468–471<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2022-12-29</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=12.4.5+Neural+Machine+Translation&rft.btitle=Deep+Learning&rft.pages=468-471&rft.pub=MIT+Press&rft.date=2016&rft.aulast=Goodfellow&rft.aufirst=Ian&rft.au=Bengio%2C+Yoshua&rft.au=Courville%2C+Aaron&rft_id=https%3A%2F%2Fwww.deeplearningbook.org%2Fcontents%2Fapplications.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Allen1987-8"><span class="mw-cite-backlink"><b><a href="#cite_ref-Allen1987_8-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFAllen1987" class="citation conference cs1">Allen, Robert B. (1987). <a rel="nofollow" class="external text" href="https://www.researchgate.net/publication/243614356"><i>Several Studies on Natural Language and Back-Propagation</i></a>. IEEE First International Conference on Neural Networks. Vol. 2. San Diego. pp. 335–341<span class="reference-accessdate">. Retrieved <span class="nowrap">2022-12-30</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.btitle=Several+Studies+on+Natural+Language+and+Back-Propagation&rft.place=San+Diego&rft.pages=335-341&rft.date=1987&rft.aulast=Allen&rft.aufirst=Robert+B.&rft_id=https%3A%2F%2Fwww.researchgate.net%2Fpublication%2F243614356&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Pollack1990-9"><span class="mw-cite-backlink"><b><a href="#cite_ref-Pollack1990_9-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFChrisman1991" class="citation journal cs1">Chrisman, Lonnie (1991). 
<a rel="nofollow" class="external text" href="https://figshare.com/articles/journal_contribution/6606899">"Learning Recursive Distributed Representations for Holistic Computation"</a>. <i>Connection Science</i>. <b>3</b> (4): 345–366. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1080%2F09540099108946592">10.1080/09540099108946592</a>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/0954-0091">0954-0091</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Connection+Science&rft.atitle=Learning+Recursive+Distributed+Representations+for+Holistic+Computation&rft.volume=3&rft.issue=4&rft.pages=345-366&rft.date=1991&rft_id=info%3Adoi%2F10.1080%2F09540099108946592&rft.issn=0954-0091&rft.aulast=Chrisman&rft.aufirst=Lonnie&rft_id=https%3A%2F%2Ffigshare.com%2Farticles%2Fjournal_contribution%2F6606899&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Chrisman1991-10"><span class="mw-cite-backlink"><b><a href="#cite_ref-Chrisman1991_10-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPollack1990" class="citation journal cs1">Pollack, Jordan B. (1990). <a rel="nofollow" class="external text" href="https://dx.doi.org/10.1016/0004-3702%2890%2990005-K">"Recursive distributed representations"</a>. <i>Artificial Intelligence</i>. <b>46</b> (1): 77–105. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1016%2F0004-3702%2890%2990005-K">10.1016/0004-3702(90)90005-K</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Artificial+Intelligence&rft.atitle=Recursive+distributed+representations&rft.volume=46&rft.issue=1&rft.pages=77-105&rft.date=1990&rft_id=info%3Adoi%2F10.1016%2F0004-3702%2890%2990005-K&rft.aulast=Pollack&rft.aufirst=Jordan+B.&rft_id=https%3A%2F%2Fdx.doi.org%2F10.1016%2F0004-3702%252890%252990005-K&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Forcada1997-11"><span class="mw-cite-backlink"><b><a href="#cite_ref-Forcada1997_11-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFForcadaÑeco1997" class="citation book cs1">Forcada, Mikel L.; Ñeco, Ramón P. (1997). "Recursive hetero-associative memories for translation". <i>Biological and Artificial Computation: From Neuroscience to Technology</i>. Lecture Notes in Computer Science. Vol. 1240. pp. 453–462. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1007%2FBFb0032504">10.1007/BFb0032504</a>. 
<a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-3-540-63047-0" title="Special:BookSources/978-3-540-63047-0"><bdi>978-3-540-63047-0</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Recursive+hetero-associative+memories+for+translation&rft.btitle=Biological+and+Artificial+Computation%3A+From+Neuroscience+to+Technology&rft.series=Lecture+Notes+in+Computer+Science&rft.pages=453-462&rft.date=1997&rft_id=info%3Adoi%2F10.1007%2FBFb0032504&rft.isbn=978-3-540-63047-0&rft.aulast=Forcada&rft.aufirst=Mikel+L.&rft.au=%C3%91eco%2C+Ram%C3%B3n+P.&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Castano1997a-12"><span class="mw-cite-backlink"><b><a href="#cite_ref-Castano1997a_12-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFCastañoCasacuberta1997" class="citation conference cs1">Castaño, Asunción; Casacuberta, Francisco (1997). <a rel="nofollow" class="external text" href="https://www.isca-speech.org/archive/eurospeech_1997/castano97_eurospeech.html"><i>A connectionist approach to machine translation</i></a>. 5th European Conference on Speech Communication and Technology (Eurospeech 1997). Rhodes, Greece. pp. 91–94. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.21437%2FEurospeech.1997-50">10.21437/Eurospeech.1997-50</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.btitle=A+connectionist+approach+to+machine+translation&rft.place=Rhodes%2C+Greece&rft.pages=91-94&rft.date=1997&rft_id=info%3Adoi%2F10.21437%2FEurospeech.1997-50&rft.aulast=Casta%C3%B1o&rft.aufirst=Asunci%C3%B3n&rft.au=Casacuberta%2C+Francisco&rft_id=https%3A%2F%2Fwww.isca-speech.org%2Farchive%2Feurospeech_1997%2Fcastano97_eurospeech.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Castano1997b-13"><span class="mw-cite-backlink"><b><a href="#cite_ref-Castano1997b_13-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFCastañoCasacubertaVidal1997" class="citation conference cs1">Castaño, Asunción; Casacuberta, Francisco; Vidal, Enrique (1997-07-23). <a rel="nofollow" class="external text" href="https://aclanthology.org/1997.tmi-1.19"><i>Machine translation using neural networks and finite-state models</i></a>. Proceedings of the 7th Conference on Theoretical and Methodological Issues in Machine Translation of Natural Languages. 
St John's College, Santa Fe.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.btitle=Machine+translation+using+neural+networks+and+finite-state+models&rft.place=St+John%27s+College%2C+Santa+Fe&rft.date=1997-07-23&rft.aulast=Casta%C3%B1o&rft.aufirst=Asunci%C3%B3n&rft.au=Casacuberta%2C+Francisco&rft.au=Vidal%2C+Enrique&rft_id=https%3A%2F%2Faclanthology.org%2F1997.tmi-1.19&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Yang2020-14"><span class="mw-cite-backlink"><b><a href="#cite_ref-Yang2020_14-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFYangWangChu2020" class="citation arxiv cs1">Yang, Shuoheng; Wang, Yuxin; Chu, Xiaowen (2020-02-18). "A Survey of Deep Learning Techniques for Neural Machine Translation". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2002.07526">2002.07526</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=A+Survey+of+Deep+Learning+Techniques+for+Neural+Machine+Translation&rft.date=2020-02-18&rft_id=info%3Aarxiv%2F2002.07526&rft.aulast=Yang&rft.aufirst=Shuoheng&rft.au=Wang%2C+Yuxin&rft.au=Chu%2C+Xiaowen&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Schwenk2006-15"><span class="mw-cite-backlink"><b><a href="#cite_ref-Schwenk2006_15-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite 
id="CITEREFSchwenkDechelotteGauvain2006" class="citation conference cs1">Schwenk, Holger; Dechelotte, Daniel; Gauvain, Jean-Luc (2006). <a rel="nofollow" class="external text" href="https://aclanthology.org/P06-2093"><i>Continuous Space Language Models for Statistical Machine Translation</i></a>. Proceedings of the COLING/ACL 2006 Main Conference Poster Sessions. Sydney, Australia. pp. 723–730.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.btitle=Continuous+Space+Language+Models+for+Statistical+Machine+Translation&rft.place=Sydney%2C+Australia&rft.pages=723-730&rft.date=2006&rft.aulast=Schwenk&rft.aufirst=Holger&rft.au=Dechelotte%2C+Daniel&rft.au=Gauvain%2C+Jean-Luc&rft_id=https%3A%2F%2Faclanthology.org%2FP06-2093&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Schwenk2007-16"><span class="mw-cite-backlink"><b><a href="#cite_ref-Schwenk2007_16-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSchwenk2007" class="citation journal cs1">Schwenk, Holger (2007). "Continuous space language models". <i>Computer Speech and Language</i>. <b>21</b> (3): 492–518. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1016%2Fj.csl.2006.09.003">10.1016/j.csl.2006.09.003</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Computer+Speech+and+Language&rft.atitle=Continuous+space+language+models&rft.volume=21&rft.issue=3&rft.pages=492-518&rft.date=2007&rft_id=info%3Adoi%2F10.1016%2Fj.csl.2006.09.003&rft.aulast=Schwenk&rft.aufirst=Holger&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Schwenk2012-17"><span class="mw-cite-backlink"><b><a href="#cite_ref-Schwenk2012_17-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSchwenk2012" class="citation conference cs1">Schwenk, Holger (2012). <a rel="nofollow" class="external text" href="https://aclanthology.org/C12-2104"><i>Continuous Space Translation Models for Phrase-Based Statistical Machine Translation</i></a>. Proceedings of COLING 2012: Posters. Mumbai, India. pp. 
1071–1080.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.btitle=Continuous+Space+Translation+Models+for+Phrase-Based+Statistical+Machine+Translation&rft.place=Mumbai%2C+India&rft.pages=1071-1080&rft.date=2012&rft.aulast=Schwenk&rft.aufirst=Holger&rft_id=https%3A%2F%2Faclanthology.org%2FC12-2104&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-KalchbrennerBlunsom2013-18"><span class="mw-cite-backlink">^ <a href="#cite_ref-KalchbrennerBlunsom2013_18-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-KalchbrennerBlunsom2013_18-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKalchbrennerBlunsom2013" class="citation journal cs1">Kalchbrenner, Nal; Blunsom, Philip (2013). <a rel="nofollow" class="external text" href="http://www.aclweb.org/anthology/D13-1176">"Recurrent Continuous Translation Models"</a>. 
<i>Proceedings of the Association for Computational Linguistics</i>: 1700–1709.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Proceedings+of+the+Association+for+Computational+Linguistics&rft.atitle=Recurrent+Continuous+Translation+Models&rft.pages=1700-1709&rft.date=2013&rft.aulast=Kalchbrenner&rft.aufirst=Nal&rft.au=Blunsom%2C+Philip&rft_id=http%3A%2F%2Fwww.aclweb.org%2Fanthology%2FD13-1176&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Cho2014EncDec-19"><span class="mw-cite-backlink"><b><a href="#cite_ref-Cho2014EncDec_19-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFChovan_MerriënboerGulcehreBahdanau2014" class="citation conference cs1">Cho, Kyunghyun; van Merriënboer, Bart; Gulcehre, Caglar; Bahdanau, Dzmitry; Bougares, Fethi; Schwenk, Holger; Bengio, Yoshua (2014). <i>Learning Phrase Representations using RNN Encoder–Decoder for Statistical Machine Translation</i>. Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP). Doha, Qatar: Association for Computational Linguistics. pp. 1724–1734. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1406.1078">1406.1078</a></span>. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.3115%2Fv1%2FD14-1179">10.3115/v1/D14-1179</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.btitle=Learning+Phrase+Representations+using+RNN+Encoder%E2%80%93Decoder+for+Statistical+Machine+Translation&rft.place=Doha%2C+Qatar&rft.pages=1724-1734&rft.pub=Association+for+Computational+Linguistics&rft.date=2014&rft_id=info%3Aarxiv%2F1406.1078&rft_id=info%3Adoi%2F10.3115%2Fv1%2FD14-1179&rft.aulast=Cho&rft.aufirst=Kyunghyun&rft.au=van+Merri%C3%ABnboer%2C+Bart&rft.au=Gulcehre%2C+Caglar&rft.au=Bahdanau%2C+Dzmitry&rft.au=Bougares%2C+Fethi&rft.au=Schwenk%2C+Holger&rft.au=Bengio%2C+Yoshua&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Sutskever2014-20"><span class="mw-cite-backlink"><b><a href="#cite_ref-Sutskever2014_20-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSutskeverVinyalsLe2014" class="citation journal cs1">Sutskever, Ilya; Vinyals, Oriol; Le, Quoc V. (2014). <a rel="nofollow" class="external text" href="https://proceedings.neurips.cc/paper/2014/hash/a14ac55a4f27472c5d894ec1c3c743d2-Abstract.html">"Sequence to Sequence Learning with Neural Networks"</a>. <i>Advances in Neural Information Processing Systems</i>. <b>27</b>. Curran Associates, Inc. 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1409.3215">1409.3215</a></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Advances+in+Neural+Information+Processing+Systems&rft.atitle=Sequence+to+Sequence+Learning+with+Neural+Networks&rft.volume=27&rft.date=2014&rft_id=info%3Aarxiv%2F1409.3215&rft.aulast=Sutskever&rft.aufirst=Ilya&rft.au=Vinyals%2C+Oriol&rft.au=Le%2C+Quoc+V.&rft_id=https%3A%2F%2Fproceedings.neurips.cc%2Fpaper%2F2014%2Fhash%2Fa14ac55a4f27472c5d894ec1c3c743d2-Abstract.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Cho2014Properties-21"><span class="mw-cite-backlink"><b><a href="#cite_ref-Cho2014Properties_21-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFChovan_MerriënboerBahdanauBengio2014" class="citation conference cs1">Cho, Kyunghyun; van Merriënboer, Bart; Bahdanau, Dzmitry; Bengio, Yoshua (2014). <i>On the Properties of Neural Machine Translation: Encoder–Decoder Approaches</i>. Proceedings of SSST-8, Eighth Workshop on Syntax, Semantics and Structure in Statistical Translation. Doha, Qatar: Association for Computational Linguistics. pp. 103–111. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1409.1259">1409.1259</a></span>. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.3115%2Fv1%2FW14-4012">10.3115/v1/W14-4012</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.btitle=On+the+Properties+of+Neural+Machine+Translation%3A+Encoder%E2%80%93Decoder+Approaches&rft.place=Doha%2C+Qatar&rft.pages=103-111&rft.pub=Association+for+Computational+Linguistics&rft.date=2014&rft_id=info%3Aarxiv%2F1409.1259&rft_id=info%3Adoi%2F10.3115%2Fv1%2FW14-4012&rft.aulast=Cho&rft.aufirst=Kyunghyun&rft.au=van+Merri%C3%ABnboer%2C+Bart&rft.au=Bahdanau%2C+Dzmitry&rft.au=Bengio%2C+Yoshua&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Bahdanau2015-22"><span class="mw-cite-backlink"><b><a href="#cite_ref-Bahdanau2015_22-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBahdanauChoBengio2014" class="citation arxiv cs1">Bahdanau, Dzmitry; Cho, Kyunghyun; Bengio, Yoshua (2014). "Neural Machine Translation by Jointly Learning to Align and Translate". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1409.0473">1409.0473</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Neural+Machine+Translation+by+Jointly+Learning+to+Align+and+Translate&rft.date=2014&rft_id=info%3Aarxiv%2F1409.0473&rft.aulast=Bahdanau&rft.aufirst=Dzmitry&rft.au=Cho%2C+Kyunghyun&rft.au=Bengio%2C+Yoshua&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Wang2022-23"><span class="mw-cite-backlink">^ <a href="#cite_ref-Wang2022_23-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Wang2022_23-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWangWuHeHuang2022" class="citation journal cs1">Wang, Haifeng; Wu, Hua; He, Zhongjun; Huang, Liang; Church, Kenneth Ward (2022-11-01). <a rel="nofollow" class="external text" href="https://www.sciencedirect.com/science/article/pii/S2095809921002745">"Progress in Machine Translation"</a>. <i>Engineering</i>. <b>18</b>: 143–153. <a href="/wiki/Bibcode_(identifier)" class="mw-redirect" title="Bibcode (identifier)">Bibcode</a>:<a rel="nofollow" class="external text" href="https://ui.adsabs.harvard.edu/abs/2022Engin..18..143W">2022Engin..18..143W</a>. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1016%2Fj.eng.2021.03.023">10.1016/j.eng.2021.03.023</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Engineering&rft.atitle=Progress+in+Machine+Translation&rft.volume=18&rft.pages=143-153&rft.date=2022-11-01&rft_id=info%3Adoi%2F10.1016%2Fj.eng.2021.03.023&rft_id=info%3Abibcode%2F2022Engin..18..143W&rft.aulast=Wang&rft.aufirst=Haifeng&rft.au=Wu%2C+Hua&rft.au=He%2C+Zhongjun&rft.au=Huang%2C+Liang&rft.au=Church%2C+Kenneth+Ward&rft_id=https%3A%2F%2Fwww.sciencedirect.com%2Fscience%2Farticle%2Fpii%2FS2095809921002745&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Wu2016-24"><span class="mw-cite-backlink"><b><a href="#cite_ref-Wu2016_24-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWuSchusterChenLe2016" class="citation arxiv cs1">Wu, Yonghui; Schuster, Mike; Chen, Zhifeng; Le, Quoc V.; Norouzi, Mohammad; Macherey, Wolfgang; Krikun, Maxim; Cao, Yuan; Gao, Qin; Macherey, Klaus; Klingner, Jeff; Shah, Apurva; Johnson, Melvin; Liu, Xiaobing; Kaiser, Łukasz (2016). "Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1609.08144">1609.08144</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Google%27s+Neural+Machine+Translation+System%3A+Bridging+the+Gap+between+Human+and+Machine+Translation&rft.date=2016&rft_id=info%3Aarxiv%2F1609.08144&rft.aulast=Wu&rft.aufirst=Yonghui&rft.au=Schuster%2C+Mike&rft.au=Chen%2C+Zhifeng&rft.au=Le%2C+Quoc+V.&rft.au=Norouzi%2C+Mohammad&rft.au=Macherey%2C+Wolfgang&rft.au=Krikun%2C+Maxim&rft.au=Cao%2C+Yuan&rft.au=Gao%2C+Qin&rft.au=Macherey%2C+Klaus&rft.au=Klingner%2C+Jeff&rft.au=Shah%2C+Apurva&rft.au=Johnson%2C+Melvin&rft.au=Liu%2C+Xiaobing&rft.au=Kaiser%2C+%C5%81ukasz&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-WMT2016-25"><span class="mw-cite-backlink"><b><a href="#cite_ref-WMT2016_25-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBojarChatterjeeFedermannGraham2016" class="citation journal cs1">Bojar, Ondrej; Chatterjee, Rajen; Federmann, Christian; Graham, Yvette; Haddow, Barry; Huck, Matthias; Yepes, Antonio Jimeno; Koehn, Philipp; Logacheva, Varvara; Monz, Christof; Negri, Matteo; Névéol, Aurélie; Neves, Mariana; Popel, Martin; Post, Matt; Rubino, Raphael; Scarton, Carolina; Specia, Lucia; Turchi, Marco; Verspoor, Karin; Zampieri, Marcos (2016). 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20180127202851/https://cris.fbk.eu/retrieve/handle/11582/307240/14326/W16-2301.pdf">"Findings of the 2016 Conference on Machine Translation"</a> <span class="cs1-format">(PDF)</span>. <i>ACL 2016 First Conference on Machine Translation (WMT16)</i>. The Association for Computational Linguistics: 131–198. Archived from <a rel="nofollow" class="external text" href="https://cris.fbk.eu/retrieve/handle/11582/307240/14326/W16-2301.pdf">the original</a> <span class="cs1-format">(PDF)</span> on 2018-01-27<span class="reference-accessdate">. Retrieved <span class="nowrap">2018-01-27</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=ACL+2016+First+Conference+on+Machine+Translation+%28WMT16%29&rft.atitle=Findings+of+the+2016+Conference+on+Machine+Translation&rft.pages=131-198&rft.date=2016&rft.aulast=Bojar&rft.aufirst=Ondrej&rft.au=Chatterjee%2C+Rajen&rft.au=Federmann%2C+Christian&rft.au=Graham%2C+Yvette&rft.au=Haddow%2C+Barry&rft.au=Huck%2C+Matthias&rft.au=Yepes%2C+Antonio+Jimeno&rft.au=Koehn%2C+Philipp&rft.au=Logacheva%2C+Varvara&rft.au=Monz%2C+Christof&rft.au=Negri%2C+Matteo&rft.au=N%C3%A9v%C3%A9ol%2C+Aur%C3%A9lie&rft.au=Neves%2C+Mariana&rft.au=Popel%2C+Martin&rft.au=Post%2C+Matt&rft.au=Rubino%2C+Raphael&rft.au=Scarton%2C+Carolina&rft.au=Specia%2C+Lucia&rft.au=Turchi%2C+Marco&rft.au=Verspoor%2C+Karin&rft.au=Zampieri%2C+Marcos&rft_id=https%3A%2F%2Fcris.fbk.eu%2Fretrieve%2Fhandle%2F11582%2F307240%2F14326%2FW16-2301.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Gehring2017-26"><span class="mw-cite-backlink"><b><a href="#cite_ref-Gehring2017_26-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFGehringAuliGrangierDauphin2017" class="citation 
conference cs1">Gehring, Jonas; Auli, Michael; Grangier, David; Dauphin, Yann (2017). <a rel="nofollow" class="external text" href="https://aclanthology.org/P17-1012"><i>A Convolutional Encoder Model for Neural Machine Translation</i></a>. Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). Vancouver, Canada: Association for Computational Linguistics. pp. 123–135. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1611.02344">1611.02344</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.18653%2Fv1%2FP17-1012">10.18653/v1/P17-1012</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.btitle=A+Convolutional+Encoder+Model+for+Neural+Machine+Translation&rft.place=Vancouver%2C+Canada&rft.pages=123-135&rft.pub=Association+for+Computational+Linguistics&rft.date=2017&rft_id=info%3Aarxiv%2F1611.02344&rft_id=info%3Adoi%2F10.18653%2Fv1%2FP17-1012&rft.aulast=Gehring&rft.aufirst=Jonas&rft.au=Auli%2C+Michael&rft.au=Grangier%2C+David&rft.au=Dauphin%2C+Yann&rft_id=https%3A%2F%2Faclanthology.org%2FP17-1012&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-27"><span class="mw-cite-backlink"><b><a href="#cite_ref-27">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFTranslator2018" class="citation web cs1">Translator, Microsoft (2018-04-18). 
<a rel="nofollow" class="external text" href="https://www.microsoft.com/en-us/translator/blog/2018/04/18/microsoft-brings-ai-powered-translation-to-end-users-and-developers-whether-youre-online-or-offline/">"Microsoft brings AI-powered translation to end users and developers, whether you're online or offline"</a>. <i>Microsoft Translator Blog</i><span class="reference-accessdate">. Retrieved <span class="nowrap">2024-04-19</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Microsoft+Translator+Blog&rft.atitle=Microsoft+brings+AI-powered+translation+to+end+users+and+developers%2C+whether+you%27re+online+or+offline&rft.date=2018-04-18&rft.aulast=Translator&rft.aufirst=Microsoft&rft_id=https%3A%2F%2Fwww.microsoft.com%2Fen-us%2Ftranslator%2Fblog%2F2018%2F04%2F18%2Fmicrosoft-brings-ai-powered-translation-to-end-users-and-developers-whether-youre-online-or-offline%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span> <span class="cs1-visible-error citation-comment"><code class="cs1-code">{{<a href="/wiki/Template:Cite_web" title="Template:Cite web">cite web</a>}}</code>: </span><span class="cs1-visible-error citation-comment"><code class="cs1-code">|last=</code> has generic name (<a href="/wiki/Help:CS1_errors#generic_name" title="Help:CS1 errors">help</a>)</span></span> </li> <li id="cite_note-DeepLTechCrunch-28"><span class="mw-cite-backlink"><b><a href="#cite_ref-DeepLTechCrunch_28-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFColdewey2017" class="citation news cs1">Coldewey, Devin (2017-08-29). <a rel="nofollow" class="external text" href="https://techcrunch.com/2017/08/29/deepl-schools-other-online-translators-with-clever-machine-learning/">"DeepL schools other online translators with clever machine learning"</a>. 
<i>TechCrunch</i><span class="reference-accessdate">. Retrieved <span class="nowrap">2023-12-26</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=TechCrunch&rft.atitle=DeepL+schools+other+online+translators+with+clever+machine+learning&rft.date=2017-08-29&rft.aulast=Coldewey&rft.aufirst=Devin&rft_id=https%3A%2F%2Ftechcrunch.com%2F2017%2F08%2F29%2Fdeepl-schools-other-online-translators-with-clever-machine-learning%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-DeepLLeMonde-29"><span class="mw-cite-backlink"><b><a href="#cite_ref-DeepLLeMonde_29-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLeloupLarousserie2022" class="citation news cs1">Leloup, Damien; Larousserie, David (2022-08-29). <a rel="nofollow" class="external text" href="https://www.lemonde.fr/pixels/article/2017/08/29/quel-est-le-meilleur-service-de-traduction-en-ligne_5177956_4408996.html">"Quel est le meilleur service de traduction en ligne?"</a>. <i>Le Monde</i><span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-01-10</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Le+Monde&rft.atitle=Quel+est+le+meilleur+service+de+traduction+en+ligne%3F&rft.date=2022-08-29&rft.aulast=Leloup&rft.aufirst=Damien&rft.au=Larousserie%2C+David&rft_id=https%3A%2F%2Fwww.lemonde.fr%2Fpixels%2Farticle%2F2017%2F08%2F29%2Fquel-est-le-meilleur-service-de-traduction-en-ligne_5177956_4408996.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-DeepLGolem-30"><span class="mw-cite-backlink"><b><a href="#cite_ref-DeepLGolem_30-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPakalski2017" class="citation news cs1">Pakalski, Ingo (2017-08-29). <a rel="nofollow" class="external text" href="https://www.golem.de/news/deepl-im-hands-on-neues-tool-uebersetzt-viel-besser-als-google-und-microsoft-1708-129715.html">"DeepL im Hands On: Neues Tool übersetzt viel besser als Google und Microsoft"</a>. <i>Golem</i><span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-01-10</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Golem&rft.atitle=DeepL+im+Hands+On%3A+Neues+Tool+%C3%BCbersetzt+viel+besser+als+Google+und+Microsoft&rft.date=2017-08-29&rft.aulast=Pakalski&rft.aufirst=Ingo&rft_id=https%3A%2F%2Fwww.golem.de%2Fnews%2Fdeepl-im-hands-on-neues-tool-uebersetzt-viel-besser-als-google-und-microsoft-1708-129715.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Vaswani2017-31"><span class="mw-cite-backlink">^ <a href="#cite_ref-Vaswani2017_31-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Vaswani2017_31-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFVaswaniShazeerParmarUszkoreit2017" class="citation conference cs1">Vaswani, Ashish; Shazeer, Noam; Parmar, Niki; Uszkoreit, Jakob; Gomez, Aidan N.; Kaiser, Łukasz; Polosukhin, Illia (2017). <a rel="nofollow" class="external text" href="https://papers.nips.cc/paper_files/paper/2017/hash/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html"><i>Attention Is All You Need</i></a>. Advances in Neural Information Processing Systems 30 (NIPS 2017). pp. 
5998–6008.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.btitle=Attention+Is+All+You+Need&rft.pages=5998-6008&rft.date=2017&rft.aulast=Vaswani&rft.aufirst=Ashish&rft.au=Shazeer%2C+Noam&rft.au=Parmar%2C+Niki&rft.au=Uszkoreit%2C+Jakob&rft.au=Gomez%2C+Aidan+N.&rft.au=Kaiser%2C+%C5%81ukasz&rft.au=Polosukhin%2C+Illia&rft_id=https%3A%2F%2Fpapers.nips.cc%2Fpaper_files%2Fpaper%2F2017%2Fhash%2F3f5ee243547dee91fbd053c1c4a845aa-Abstract.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-WMT2022-32"><span class="mw-cite-backlink"><b><a href="#cite_ref-WMT2022_32-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKocmiBawdenBojarDvorkovich2022" class="citation conference cs1">Kocmi, Tom; Bawden, Rachel; Bojar, Ondřej; Dvorkovich, Anton; Federmann, Christian; Fishel, Mark; Gowda, Thamme; Graham, Yvette; Grundkiewicz, Roman; Haddow, Barry; Knowles, Rebecca; Koehn, Philipp; Monz, Christof; Morishita, Makoto; Nagata, Masaaki (2022). Koehn, Philipp; Barrault, Loïc; Bojar, Ondřej; Bougares, Fethi; Chatterjee, Rajen; Costa-jussà, Marta R.; Federmann, Christian; Fishel, Mark; Fraser, Alexander (eds.). <a rel="nofollow" class="external text" href="https://aclanthology.org/2022.wmt-1.1"><i>Findings of the 2022 Conference on Machine Translation (WMT22)</i></a>. Proceedings of the Seventh Conference on Machine Translation (WMT). Abu Dhabi, United Arab Emirates (Hybrid): Association for Computational Linguistics. pp. 
1–45.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.btitle=Findings+of+the+2022+Conference+on+Machine+Translation+%28WMT22%29&rft.place=Abu+Dhabi%2C+United+Arab+Emirates+%28Hybrid%29&rft.pages=1-45&rft.pub=Association+for+Computational+Linguistics&rft.date=2022&rft.aulast=Kocmi&rft.aufirst=Tom&rft.au=Bawden%2C+Rachel&rft.au=Bojar%2C+Ond%C5%99ej&rft.au=Dvorkovich%2C+Anton&rft.au=Federmann%2C+Christian&rft.au=Fishel%2C+Mark&rft.au=Gowda%2C+Thamme&rft.au=Graham%2C+Yvette&rft.au=Grundkiewicz%2C+Roman&rft.au=Haddow%2C+Barry&rft.au=Knowles%2C+Rebecca&rft.au=Koehn%2C+Philipp&rft.au=Monz%2C+Christof&rft.au=Morishita%2C+Makoto&rft.au=Nagata%2C+Masaaki&rft_id=https%3A%2F%2Faclanthology.org%2F2022.wmt-1.1&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-WMT2023-33"><span class="mw-cite-backlink">^ <a href="#cite_ref-WMT2023_33-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-WMT2023_33-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKocmiAvramidisBawdenBojar2023" class="citation conference cs1">Kocmi, Tom; Avramidis, Eleftherios; Bawden, Rachel; Bojar, Ondřej; Dvorkovich, Anton; Federmann, Christian; Fishel, Mark; Freitag, Markus; Gowda, Thamme; Grundkiewicz, Roman; Haddow, Barry; Koehn, Philipp; Marie, Benjamin; Monz, Christof; Morishita, Makoto (2023). Koehn, Philipp; Haddow, Barry; Kocmi, Tom; Monz, Christof (eds.). <a rel="nofollow" class="external text" href="https://aclanthology.org/2023.wmt-1.1"><i>Findings of the 2023 Conference on Machine Translation (WMT23): LLMs Are Here but Not Quite There Yet</i></a>. <i>Proceedings of the Eighth Conference on Machine Translation</i>. Singapore: Association for Computational Linguistics. pp. 1–42. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.18653%2Fv1%2F2023.wmt-1.1">10.18653/v1/2023.wmt-1.1</a></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=conference&rft.jtitle=Proceedings+of+the+Eighth+Conference+on+Machine+Translation&rft.atitle=Findings+of+the+2023+Conference+on+Machine+Translation+%28WMT23%29%3A+LLMs+Are+Here+but+Not+Quite+There+Yet&rft.pages=1-42&rft.date=2023&rft_id=info%3Adoi%2F10.18653%2Fv1%2F2023.wmt-1.1&rft.aulast=Kocmi&rft.aufirst=Tom&rft.au=Avramidis%2C+Eleftherios&rft.au=Bawden%2C+Rachel&rft.au=Bojar%2C+Ond%C5%99ej&rft.au=Dvorkovich%2C+Anton&rft.au=Federmann%2C+Christian&rft.au=Fishel%2C+Mark&rft.au=Freitag%2C+Markus&rft.au=Gowda%2C+Thamme&rft.au=Grundkiewicz%2C+Roman&rft.au=Haddow%2C+Barry&rft.au=Koehn%2C+Philipp&rft.au=Marie%2C+Benjamin&rft.au=Monz%2C+Christof&rft.au=Morishita%2C+Makoto&rft_id=https%3A%2F%2Faclanthology.org%2F2023.wmt-1.1&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Liu2020-34"><span class="mw-cite-backlink">^ <a href="#cite_ref-Liu2020_34-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Liu2020_34-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLiuGuGoyalLi2020" class="citation journal cs1">Liu, Yinhan; Gu, Jiatao; Goyal, Naman; Li, Xian; Edunov, Sergey; Ghazvininejad, Marjan; Lewis, Mike; Zettlemoyer, Luke (2020). <a rel="nofollow" class="external text" href="https://doi.org/10.1162/tacl_a_00343">"Multilingual Denoising Pre-training for Neural Machine Translation"</a>. <i>Transactions of the Association for Computational Linguistics</i>. <b>8</b>: 726–742. 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2001.08210">2001.08210</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1162%2Ftacl_a_00343">10.1162/tacl_a_00343</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Transactions+of+the+Association+for+Computational+Linguistics&rft.atitle=Multilingual+Denoising+Pre-training+for+Neural+Machine+Translation&rft.volume=8&rft.pages=726-742&rft.date=2020&rft_id=info%3Aarxiv%2F2001.08210&rft_id=info%3Adoi%2F10.1162%2Ftacl_a_00343&rft.aulast=Liu&rft.aufirst=Yinhan&rft.au=Gu%2C+Jiatao&rft.au=Goyal%2C+Naman&rft.au=Li%2C+Xian&rft.au=Edunov%2C+Sergey&rft.au=Ghazvininejad%2C+Marjan&rft.au=Lewis%2C+Mike&rft.au=Zettlemoyer%2C+Luke&rft_id=https%3A%2F%2Fdoi.org%2F10.1162%2Ftacl_a_00343&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Hendy2023-35"><span class="mw-cite-backlink">^ <a href="#cite_ref-Hendy2023_35-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Hendy2023_35-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-Hendy2023_35-2"><sup><i><b>c</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFHendyAbdelrehimSharafRaunak2023" class="citation arxiv cs1">Hendy, Amr; Abdelrehim, Mohamed; Sharaf, Amr; Raunak, Vikas; Gabr, Mohamed; Matsushita, Hitokazu; Kim, Young Jin; Afify, Mohamed; Awadalla, Hany (2023-02-18). "How Good Are GPT Models at Machine Translation? A Comprehensive Evaluation". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2302.09210">2302.09210</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=How+Good+Are+GPT+Models+at+Machine+Translation%3F+A+Comprehensive+Evaluation&rft.date=2023-02-18&rft_id=info%3Aarxiv%2F2302.09210&rft.aulast=Hendy&rft.aufirst=Amr&rft.au=Abdelrehim%2C+Mohamed&rft.au=Sharaf%2C+Amr&rft.au=Raunak%2C+Vikas&rft.au=Gabr%2C+Mohamed&rft.au=Matsushita%2C+Hitokazu&rft.au=Kim%2C+Young+Jin&rft.au=Afify%2C+Mohamed&rft.au=Awadalla%2C+Hany&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-GPT3LanguagesByCharacterCount2020-36"><span class="mw-cite-backlink">^ <a href="#cite_ref-GPT3LanguagesByCharacterCount2020_36-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-GPT3LanguagesByCharacterCount2020_36-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://github.com/openai/gpt-3/blob/master/dataset_statistics/languages_by_character_count.csv">"GPT 3 dataset statistics: languages by character count"</a>. OpenAI. 2020-06-01<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-12-23</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=GPT+3+dataset+statistics%3A+languages+by+character+count&rft.pub=OpenAI&rft.date=2020-06-01&rft_id=https%3A%2F%2Fgithub.com%2Fopenai%2Fgpt-3%2Fblob%2Fmaster%2Fdataset_statistics%2Flanguages_by_character_count.csv&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Russell2020-37"><span class="mw-cite-backlink">^ <a href="#cite_ref-Russell2020_37-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Russell2020_37-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRussellNorvig" class="citation book cs1">Russell, Stuart; Norvig, Peter. <a rel="nofollow" class="external text" href="http://aima.cs.berkeley.edu/global-index.html"><i>Artificial Intelligence: A Modern Approach</i></a> (4th, global ed.). Pearson.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Artificial+Intelligence%3A+A+Modern+Approach&rft.edition=4th%2C+global&rft.pub=Pearson&rft.aulast=Russell&rft.aufirst=Stuart&rft.au=Norvig%2C+Peter&rft_id=http%3A%2F%2Faima.cs.berkeley.edu%2Fglobal-index.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Federico2007-38"><span class="mw-cite-backlink"><b><a href="#cite_ref-Federico2007_38-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFFedericoCettolo2007" class="citation journal cs1">Federico, Marcello; Cettolo, Mauro (2007). Callison-Burch, Chris; Koehn, Philipp; Fordyce, Cameron Shaw; Monz, Christof (eds.). 
<a rel="nofollow" class="external text" href="https://aclanthology.org/W07-0712">"Efficient Handling of N-gram Language Models for Statistical Machine Translation"</a>. <i>Proceedings of the Second Workshop on Statistical Machine Translation</i>. Prague, Czech Republic: Association for Computational Linguistics: 88–95. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.3115%2F1626355.1626367">10.3115/1626355.1626367</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Proceedings+of+the+Second+Workshop+on+Statistical+Machine+Translation&rft.atitle=Efficient+Handling+of+N-gram+Language+Models+for+Statistical+Machine+Translation&rft.pages=88-95&rft.date=2007&rft_id=info%3Adoi%2F10.3115%2F1626355.1626367&rft.aulast=Federico&rft.aufirst=Marcello&rft.au=Cettolo%2C+Mauro&rft_id=https%3A%2F%2Faclanthology.org%2FW07-0712&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Radford2018-39"><span class="mw-cite-backlink"><b><a href="#cite_ref-Radford2018_39-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRadfordNarasimhanSalimansSutskever2018" class="citation techreport cs1">Radford, Alec; Narasimhan, Karthik; Salimans, Tim; Sutskever, Ilya (2018). <a rel="nofollow" class="external text" href="https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf"><i>Improving Language Understanding by Generative Pre-Training</i></a> <span class="cs1-format">(PDF)</span> (Technical report). OpenAI<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-12-26</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=report&rft.btitle=Improving+Language+Understanding+by+Generative+Pre-Training&rft.pub=OpenAI&rft.date=2018&rft.aulast=Radford&rft.aufirst=Alec&rft.au=Narasimhan%2C+Karthik&rft.au=Salimans%2C+Tim&rft.au=Sutskever%2C+Ilya&rft_id=https%3A%2F%2Fcdn.openai.com%2Fresearch-covers%2Flanguage-unsupervised%2Flanguage_understanding_paper.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> <li id="cite_note-Brown2020-40"><span class="mw-cite-backlink"><b><a href="#cite_ref-Brown2020_40-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBrownMannRyderSubbiah2020" class="citation journal cs1">Brown, Tom; Mann, Benjamin; Ryder, Nick; Subbiah, Melanie; Kaplan, Jared D; Dhariwal, Prafulla; Neelakantan, Arvind; Shyam, Pranav; Sastry, Girish; Askell, Amanda; Agarwal, Sandhini; Herbert-Voss, Ariel; Krueger, Gretchen; Henighan, Tom; Child, Rewon (2020). <a rel="nofollow" class="external text" href="https://proceedings.neurips.cc/paper/2020/hash/1457c0d6bfcb4967418bfb8ac142f64a-Abstract.html">"Language Models are Few-Shot Learners"</a>. <i>Advances in Neural Information Processing Systems</i>. <b>33</b>. 
Curran Associates, Inc.: 1877–1901.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Advances+in+Neural+Information+Processing+Systems&rft.atitle=Language+Models+are+Few-Shot+Learners&rft.volume=33&rft.pages=1877-1901&rft.date=2020&rft.aulast=Brown&rft.aufirst=Tom&rft.au=Mann%2C+Benjamin&rft.au=Ryder%2C+Nick&rft.au=Subbiah%2C+Melanie&rft.au=Kaplan%2C+Jared+D&rft.au=Dhariwal%2C+Prafulla&rft.au=Neelakantan%2C+Arvind&rft.au=Shyam%2C+Pranav&rft.au=Sastry%2C+Girish&rft.au=Askell%2C+Amanda&rft.au=Agarwal%2C+Sandhini&rft.au=Herbert-Voss%2C+Ariel&rft.au=Krueger%2C+Gretchen&rft.au=Henighan%2C+Tom&rft.au=Child%2C+Rewon&rft_id=https%3A%2F%2Fproceedings.neurips.cc%2Fpaper%2F2020%2Fhash%2F1457c0d6bfcb4967418bfb8ac142f64a-Abstract.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ANeural+machine+translation" class="Z3988"></span></span> </li> </ol></div></div> <div class="navbox-styles"><style data-mw-deduplicate="TemplateStyles:r1129693374">.mw-parser-output .hlist dl,.mw-parser-output .hlist ol,.mw-parser-output .hlist ul{margin:0;padding:0}.mw-parser-output .hlist dd,.mw-parser-output .hlist dt,.mw-parser-output .hlist li{margin:0;display:inline}.mw-parser-output .hlist.inline,.mw-parser-output .hlist.inline dl,.mw-parser-output .hlist.inline ol,.mw-parser-output .hlist.inline ul,.mw-parser-output .hlist dl dl,.mw-parser-output .hlist dl ol,.mw-parser-output .hlist dl ul,.mw-parser-output .hlist ol dl,.mw-parser-output .hlist ol ol,.mw-parser-output .hlist ol ul,.mw-parser-output .hlist ul dl,.mw-parser-output .hlist ul ol,.mw-parser-output .hlist ul ul{display:inline}.mw-parser-output .hlist .mw-empty-li{display:none}.mw-parser-output .hlist dt::after{content:": "}.mw-parser-output .hlist dd::after,.mw-parser-output .hlist li::after{content:" · ";font-weight:bold}.mw-parser-output .hlist dd:last-child::after,.mw-parser-output .hlist dt:last-child::after,.mw-parser-output .hlist 
li:last-child::after{content:none}.mw-parser-output .hlist dd dd:first-child::before,.mw-parser-output .hlist dd dt:first-child::before,.mw-parser-output .hlist dd li:first-child::before,.mw-parser-output .hlist dt dd:first-child::before,.mw-parser-output .hlist dt dt:first-child::before,.mw-parser-output .hlist dt li:first-child::before,.mw-parser-output .hlist li dd:first-child::before,.mw-parser-output .hlist li dt:first-child::before,.mw-parser-output .hlist li li:first-child::before{content:" (";font-weight:normal}.mw-parser-output .hlist dd dd:last-child::after,.mw-parser-output .hlist dd dt:last-child::after,.mw-parser-output .hlist dd li:last-child::after,.mw-parser-output .hlist dt dd:last-child::after,.mw-parser-output .hlist dt dt:last-child::after,.mw-parser-output .hlist dt li:last-child::after,.mw-parser-output .hlist li dd:last-child::after,.mw-parser-output .hlist li dt:last-child::after,.mw-parser-output .hlist li li:last-child::after{content:")";font-weight:normal}.mw-parser-output .hlist ol{counter-reset:listitem}.mw-parser-output .hlist ol>li{counter-increment:listitem}.mw-parser-output .hlist ol>li::before{content:" "counter(listitem)"\a0 "}.mw-parser-output .hlist dd ol>li:first-child::before,.mw-parser-output .hlist dt ol>li:first-child::before,.mw-parser-output .hlist li ol>li:first-child::before{content:" ("counter(listitem)"\a0 "}</style><style data-mw-deduplicate="TemplateStyles:r1236075235">.mw-parser-output .navbox{box-sizing:border-box;border:1px solid #a2a9b1;width:100%;clear:both;font-size:88%;text-align:center;padding:1px;margin:1em auto 0}.mw-parser-output .navbox .navbox{margin-top:0}.mw-parser-output .navbox+.navbox,.mw-parser-output .navbox+.navbox-styles+.navbox{margin-top:-1px}.mw-parser-output .navbox-inner,.mw-parser-output .navbox-subgroup{width:100%}.mw-parser-output .navbox-group,.mw-parser-output .navbox-title,.mw-parser-output .navbox-abovebelow{padding:0.25em 1em;line-height:1.5em;text-align:center}.mw-parser-output 
.navbox-group{white-space:nowrap;text-align:right}.mw-parser-output .navbox,.mw-parser-output .navbox-subgroup{background-color:#fdfdfd}.mw-parser-output .navbox-list{line-height:1.5em;border-color:#fdfdfd}.mw-parser-output .navbox-list-with-group{text-align:left;border-left-width:2px;border-left-style:solid}.mw-parser-output tr+tr>.navbox-abovebelow,.mw-parser-output tr+tr>.navbox-group,.mw-parser-output tr+tr>.navbox-image,.mw-parser-output tr+tr>.navbox-list{border-top:2px solid #fdfdfd}.mw-parser-output .navbox-title{background-color:#ccf}.mw-parser-output .navbox-abovebelow,.mw-parser-output .navbox-group,.mw-parser-output .navbox-subgroup .navbox-title{background-color:#ddf}.mw-parser-output .navbox-subgroup .navbox-group,.mw-parser-output .navbox-subgroup .navbox-abovebelow{background-color:#e6e6ff}.mw-parser-output .navbox-even{background-color:#f7f7f7}.mw-parser-output .navbox-odd{background-color:transparent}.mw-parser-output .navbox .hlist td dl,.mw-parser-output .navbox .hlist td ol,.mw-parser-output .navbox .hlist td ul,.mw-parser-output .navbox td.hlist dl,.mw-parser-output .navbox td.hlist ol,.mw-parser-output .navbox td.hlist ul{padding:0.125em 0}.mw-parser-output .navbox .navbar{display:block;font-size:100%}.mw-parser-output .navbox-title .navbar{float:left;text-align:left;margin-right:0.5em}body.skin--responsive .mw-parser-output .navbox-image img{max-width:none!important}@media print{body.ns-0 .mw-parser-output .navbox{display:none!important}}</style></div><div role="navigation" class="navbox" aria-labelledby="Approaches_to_machine_translation" style="padding:3px"><table class="nowraplinks mw-collapsible autocollapse navbox-inner" style="border-spacing:0;background:transparent;color:inherit"><tbody><tr><th scope="col" class="navbox-title" colspan="2"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374"><style data-mw-deduplicate="TemplateStyles:r1239400231">.mw-parser-output 
.navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{float:left;text-align:left}.mw-parser-output .navbar-boxtext{word-spacing:0}.mw-parser-output .navbar ul{display:inline-block;white-space:nowrap;line-height:inherit}.mw-parser-output .navbar-brackets::before{margin-right:-0.125em;content:"[ "}.mw-parser-output .navbar-brackets::after{margin-left:-0.125em;content:" ]"}.mw-parser-output .navbar li{word-spacing:-0.125em}.mw-parser-output .navbar a>span,.mw-parser-output .navbar a>abbr{text-decoration:inherit}.mw-parser-output .navbar-mini abbr{font-variant:small-caps;border-bottom:none;text-decoration:none;cursor:inherit}.mw-parser-output .navbar-ct-full{font-size:114%;margin:0 7em}.mw-parser-output .navbar-ct-mini{font-size:114%;margin:0 4em}html.skin-theme-clientpref-night .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}@media(prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}}@media print{.mw-parser-output .navbar{display:none!important}}</style><div class="navbar plainlinks hlist navbar-mini"><ul><li class="nv-view"><a href="/wiki/Template:Approaches_to_machine_translation" title="Template:Approaches to machine translation"><abbr title="View this template">v</abbr></a></li><li class="nv-talk"><a href="/wiki/Template_talk:Approaches_to_machine_translation" title="Template talk:Approaches to machine translation"><abbr title="Discuss this template">t</abbr></a></li><li class="nv-edit"><a href="/wiki/Special:EditPage/Template:Approaches_to_machine_translation" title="Special:EditPage/Template:Approaches to machine translation"><abbr title="Edit this template">e</abbr></a></li></ul></div><div id="Approaches_to_machine_translation" style="font-size:114%;margin:0 4em">Approaches to <a href="/wiki/Machine_translation" title="Machine translation">machine translation</a></div></th></tr><tr><td colspan="2" class="navbox-list navbox-odd 
hlist" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Dictionary-based_machine_translation" title="Dictionary-based machine translation">Dictionary-based</a></li> <li><a href="/wiki/Rule-based_machine_translation" title="Rule-based machine translation">Rule-based</a></li> <li><a href="/wiki/Transfer-based_machine_translation" title="Transfer-based machine translation">Transfer-based</a></li> <li><a href="/wiki/Statistical_machine_translation" title="Statistical machine translation">Statistical</a></li> <li><a href="/wiki/Example-based_machine_translation" title="Example-based machine translation">Example-based</a></li> <li><a href="/wiki/Interlingual_machine_translation" title="Interlingual machine translation">Interlingual</a></li> <li><a class="mw-selflink selflink">Neural</a></li> <li><a href="/wiki/Hybrid_machine_translation" title="Hybrid machine translation">Hybrid</a></li></ul> </div></td></tr></tbody></table></div> <div class="navbox-styles"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236075235"></div><div role="navigation" class="navbox" aria-labelledby="Artificial_intelligence" style="padding:3px"><table class="nowraplinks hlist mw-collapsible autocollapse navbox-inner" style="border-spacing:0;background:transparent;color:inherit"><tbody><tr><th scope="col" class="navbox-title" colspan="2"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1239400231"><div class="navbar plainlinks hlist navbar-mini"><ul><li class="nv-view"><a href="/wiki/Template:Artificial_intelligence_(AI)" title="Template:Artificial intelligence (AI)"><abbr title="View this template">v</abbr></a></li><li class="nv-talk"><a href="/wiki/Template_talk:Artificial_intelligence_(AI)" class="mw-redirect" title="Template talk:Artificial intelligence 
(AI)"><abbr title="Discuss this template">t</abbr></a></li><li class="nv-edit"><a href="/wiki/Special:EditPage/Template:Artificial_intelligence_(AI)" title="Special:EditPage/Template:Artificial intelligence (AI)"><abbr title="Edit this template">e</abbr></a></li></ul></div><div id="Artificial_intelligence" style="font-size:114%;margin:0 4em"><a href="/wiki/Artificial_intelligence" title="Artificial intelligence">Artificial intelligence</a></div></th></tr><tr><th scope="row" class="navbox-group" style="width:1%">Concepts</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Parameter" title="Parameter">Parameter</a> <ul><li><a href="/wiki/Hyperparameter_(machine_learning)" title="Hyperparameter (machine learning)">Hyperparameter</a></li></ul></li> <li><a href="/wiki/Loss_functions_for_classification" title="Loss functions for classification">Loss functions</a></li> <li><a href="/wiki/Regression_analysis" title="Regression analysis">Regression</a> <ul><li><a href="/wiki/Bias%E2%80%93variance_tradeoff" title="Bias–variance tradeoff">Bias–variance tradeoff</a></li> <li><a href="/wiki/Double_descent" title="Double descent">Double descent</a></li> <li><a href="/wiki/Overfitting" title="Overfitting">Overfitting</a></li></ul></li> <li><a href="/wiki/Cluster_analysis" title="Cluster analysis">Clustering</a></li> <li><a href="/wiki/Gradient_descent" title="Gradient descent">Gradient descent</a> <ul><li><a href="/wiki/Stochastic_gradient_descent" title="Stochastic gradient descent">SGD</a></li> <li><a href="/wiki/Quasi-Newton_method" title="Quasi-Newton method">Quasi-Newton method</a></li> <li><a href="/wiki/Conjugate_gradient_method" title="Conjugate gradient method">Conjugate gradient method</a></li></ul></li> <li><a href="/wiki/Backpropagation" title="Backpropagation">Backpropagation</a></li> <li><a href="/wiki/Attention_(machine_learning)" title="Attention (machine 
learning)">Attention</a></li> <li><a href="/wiki/Convolution" title="Convolution">Convolution</a></li> <li><a href="/wiki/Normalization_(machine_learning)" title="Normalization (machine learning)">Normalization</a> <ul><li><a href="/wiki/Batch_normalization" title="Batch normalization">Batchnorm</a></li></ul></li> <li><a href="/wiki/Activation_function" title="Activation function">Activation</a> <ul><li><a href="/wiki/Softmax_function" title="Softmax function">Softmax</a></li> <li><a href="/wiki/Sigmoid_function" title="Sigmoid function">Sigmoid</a></li> <li><a href="/wiki/Rectifier_(neural_networks)" title="Rectifier (neural networks)">Rectifier</a></li></ul></li> <li><a href="/wiki/Gating_mechanism" title="Gating mechanism">Gating</a></li> <li><a href="/wiki/Weight_initialization" title="Weight initialization">Weight initialization</a></li> <li><a href="/wiki/Regularization_(mathematics)" title="Regularization (mathematics)">Regularization</a></li> <li><a href="/wiki/Training,_validation,_and_test_data_sets" title="Training, validation, and test data sets">Datasets</a> <ul><li><a href="/wiki/Data_augmentation" title="Data augmentation">Augmentation</a></li></ul></li> <li><a href="/wiki/Prompt_engineering" title="Prompt engineering">Prompt engineering</a></li> <li><a href="/wiki/Reinforcement_learning" title="Reinforcement learning">Reinforcement learning</a> <ul><li><a href="/wiki/Q-learning" title="Q-learning">Q-learning</a></li> <li><a href="/wiki/State%E2%80%93action%E2%80%93reward%E2%80%93state%E2%80%93action" title="State–action–reward–state–action">SARSA</a></li> <li><a href="/wiki/Imitation_learning" title="Imitation learning">Imitation</a></li></ul></li> <li><a href="/wiki/Diffusion_process" title="Diffusion process">Diffusion</a></li> <li><a href="/wiki/Latent_diffusion_model" title="Latent diffusion model">Latent diffusion model</a></li> <li><a href="/wiki/Autoregressive_model" title="Autoregressive model">Autoregression</a></li> <li><a 
href="/wiki/Adversarial_machine_learning" title="Adversarial machine learning">Adversary</a></li> <li><a href="/wiki/Retrieval-augmented_generation" title="Retrieval-augmented generation">RAG</a></li> <li><a href="/wiki/Reinforcement_learning_from_human_feedback" title="Reinforcement learning from human feedback">RLHF</a></li> <li><a href="/wiki/Self-supervised_learning" title="Self-supervised learning">Self-supervised learning</a></li> <li><a href="/wiki/Word_embedding" title="Word embedding">Word embedding</a></li> <li><a href="/wiki/Hallucination_(artificial_intelligence)" title="Hallucination (artificial intelligence)">Hallucination</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Applications</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Machine_learning" title="Machine learning">Machine learning</a> <ul><li><a href="/wiki/Prompt_engineering#In-context_learning" title="Prompt engineering">In-context learning</a></li></ul></li> <li><a href="/wiki/Neural_network_(machine_learning)" title="Neural network (machine learning)">Artificial neural network</a> <ul><li><a href="/wiki/Deep_learning" title="Deep learning">Deep learning</a></li></ul></li> <li><a href="/wiki/Language_model" title="Language model">Language model</a> <ul><li><a href="/wiki/Large_language_model" title="Large language model">Large language model</a></li> <li><a class="mw-selflink selflink">NMT</a></li></ul></li> <li><a href="/wiki/Artificial_general_intelligence" title="Artificial general intelligence">Artificial general intelligence</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Implementations</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"></div><table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbody><tr><th scope="row" 
class="navbox-group" style="width:1%">Audio–visual</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/AlexNet" title="AlexNet">AlexNet</a></li> <li><a href="/wiki/WaveNet" title="WaveNet">WaveNet</a></li> <li><a href="/wiki/Human_image_synthesis" title="Human image synthesis">Human image synthesis</a></li> <li><a href="/wiki/Handwriting_recognition" title="Handwriting recognition">HWR</a></li> <li><a href="/wiki/Optical_character_recognition" title="Optical character recognition">OCR</a></li> <li><a href="/wiki/Deep_learning_speech_synthesis" title="Deep learning speech synthesis">Speech synthesis</a> <ul><li><a href="/wiki/ElevenLabs" title="ElevenLabs">ElevenLabs</a></li></ul></li> <li><a href="/wiki/Speech_recognition" title="Speech recognition">Speech recognition</a> <ul><li><a href="/wiki/Whisper_(speech_recognition_system)" title="Whisper (speech recognition system)">Whisper</a></li></ul></li> <li><a href="/wiki/Facial_recognition_system" title="Facial recognition system">Facial recognition</a></li> <li><a href="/wiki/AlphaFold" title="AlphaFold">AlphaFold</a></li> <li><a href="/wiki/Text-to-image_model" title="Text-to-image model">Text-to-image models</a> <ul><li><a href="/wiki/DALL-E" title="DALL-E">DALL-E</a></li> <li><a href="/wiki/Flux_(text-to-image_model)" title="Flux (text-to-image model)">Flux</a></li> <li><a href="/wiki/Ideogram_(text-to-image_model)" title="Ideogram (text-to-image model)">Ideogram</a></li> <li><a href="/wiki/Midjourney" title="Midjourney">Midjourney</a></li> <li><a href="/wiki/Stable_Diffusion" title="Stable Diffusion">Stable Diffusion</a></li></ul></li> <li><a href="/wiki/Text-to-video_model" title="Text-to-video model">Text-to-video models</a> <ul><li><a href="/wiki/Sora_(text-to-video_model)" title="Sora (text-to-video model)">Sora</a></li> <li><a href="/wiki/Dream_Machine_(text-to-video_model)" title="Dream Machine (text-to-video 
model)">Dream Machine</a></li> <li><a href="/wiki/VideoPoet" title="VideoPoet">VideoPoet</a></li></ul></li> <li><a href="/wiki/Music_and_artificial_intelligence" title="Music and artificial intelligence">Music generation</a> <ul><li><a href="/wiki/Suno_AI" title="Suno AI">Suno AI</a></li> <li><a href="/wiki/Udio" title="Udio">Udio</a></li></ul></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Text</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Word2vec" title="Word2vec">Word2vec</a></li> <li><a href="/wiki/Seq2seq" title="Seq2seq">Seq2seq</a></li> <li><a href="/wiki/GloVe" title="GloVe">GloVe</a></li> <li><a href="/wiki/BERT_(language_model)" title="BERT (language model)">BERT</a></li> <li><a href="/wiki/T5_(language_model)" title="T5 (language model)">T5</a></li> <li><a href="/wiki/Llama_(language_model)" title="Llama (language model)">Llama</a></li> <li><a href="/wiki/Chinchilla_(language_model)" title="Chinchilla (language model)">Chinchilla AI</a></li> <li><a href="/wiki/PaLM" title="PaLM">PaLM</a></li> <li><a href="/wiki/Generative_pre-trained_transformer" title="Generative pre-trained transformer">GPT</a> <ul><li><a href="/wiki/GPT-1" title="GPT-1">1</a></li> <li><a href="/wiki/GPT-2" title="GPT-2">2</a></li> <li><a href="/wiki/GPT-3" title="GPT-3">3</a></li> <li><a href="/wiki/GPT-J" title="GPT-J">J</a></li> <li><a href="/wiki/ChatGPT" title="ChatGPT">ChatGPT</a></li> <li><a href="/wiki/GPT-4" title="GPT-4">4</a></li> <li><a href="/wiki/GPT-4o" title="GPT-4o">4o</a></li> <li><a href="/wiki/OpenAI_o1" title="OpenAI o1">o1</a></li></ul></li> <li><a href="/wiki/Claude_(language_model)" title="Claude (language model)">Claude</a></li> <li><a href="/wiki/Gemini_(language_model)" title="Gemini (language model)">Gemini</a></li> <li><a href="/wiki/Grok_(chatbot)" title="Grok (chatbot)">Grok</a></li> <li><a href="/wiki/LaMDA" 
title="LaMDA">LaMDA</a></li> <li><a href="/wiki/BLOOM_(language_model)" title="BLOOM (language model)">BLOOM</a></li> <li><a href="/wiki/Project_Debater" title="Project Debater">Project Debater</a></li> <li><a href="/wiki/IBM_Watson" title="IBM Watson">IBM Watson</a></li> <li><a href="/wiki/IBM_Watsonx" title="IBM Watsonx">IBM Watsonx</a></li> <li><a href="/wiki/IBM_Granite" title="IBM Granite">Granite</a></li> <li><a href="/wiki/Huawei_PanGu" title="Huawei PanGu">PanGu-Σ</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Decisional</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/AlphaGo" title="AlphaGo">AlphaGo</a></li> <li><a href="/wiki/AlphaZero" title="AlphaZero">AlphaZero</a></li> <li><a href="/wiki/OpenAI_Five" title="OpenAI Five">OpenAI Five</a></li> <li><a href="/wiki/Self-driving_car" title="Self-driving car">Self-driving car</a></li> <li><a href="/wiki/MuZero" title="MuZero">MuZero</a></li> <li><a href="/wiki/Action_selection" title="Action selection">Action selection</a> <ul><li><a href="/wiki/AutoGPT" title="AutoGPT">AutoGPT</a></li></ul></li> <li><a href="/wiki/Robot_control" title="Robot control">Robot control</a></li></ul> </div></td></tr></tbody></table><div></div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">People</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Alan_Turing" title="Alan Turing">Alan Turing</a></li> <li><a href="/wiki/Warren_Sturgis_McCulloch" title="Warren Sturgis McCulloch">Warren Sturgis McCulloch</a></li> <li><a href="/wiki/Walter_Pitts" title="Walter Pitts">Walter Pitts</a></li> <li><a href="/wiki/John_von_Neumann" title="John von Neumann">John von Neumann</a></li> <li><a href="/wiki/Claude_Shannon" title="Claude Shannon">Claude Shannon</a></li> <li><a 
href="/wiki/Marvin_Minsky" title="Marvin Minsky">Marvin Minsky</a></li> <li><a href="/wiki/John_McCarthy_(computer_scientist)" title="John McCarthy (computer scientist)">John McCarthy</a></li> <li><a href="/wiki/Nathaniel_Rochester_(computer_scientist)" title="Nathaniel Rochester (computer scientist)">Nathaniel Rochester</a></li> <li><a href="/wiki/Allen_Newell" title="Allen Newell">Allen Newell</a></li> <li><a href="/wiki/Cliff_Shaw" title="Cliff Shaw">Cliff Shaw</a></li> <li><a href="/wiki/Herbert_A._Simon" title="Herbert A. Simon">Herbert A. Simon</a></li> <li><a href="/wiki/Oliver_Selfridge" title="Oliver Selfridge">Oliver Selfridge</a></li> <li><a href="/wiki/Frank_Rosenblatt" title="Frank Rosenblatt">Frank Rosenblatt</a></li> <li><a href="/wiki/Bernard_Widrow" title="Bernard Widrow">Bernard Widrow</a></li> <li><a href="/wiki/Joseph_Weizenbaum" title="Joseph Weizenbaum">Joseph Weizenbaum</a></li> <li><a href="/wiki/Seymour_Papert" title="Seymour Papert">Seymour Papert</a></li> <li><a href="/wiki/Seppo_Linnainmaa" title="Seppo Linnainmaa">Seppo Linnainmaa</a></li> <li><a href="/wiki/Paul_Werbos" title="Paul Werbos">Paul Werbos</a></li> <li><a href="/wiki/J%C3%BCrgen_Schmidhuber" title="Jürgen Schmidhuber">Jürgen Schmidhuber</a></li> <li><a href="/wiki/Yann_LeCun" title="Yann LeCun">Yann LeCun</a></li> <li><a href="/wiki/Geoffrey_Hinton" title="Geoffrey Hinton">Geoffrey Hinton</a></li> <li><a href="/wiki/John_Hopfield" title="John Hopfield">John Hopfield</a></li> <li><a href="/wiki/Yoshua_Bengio" title="Yoshua Bengio">Yoshua Bengio</a></li> <li><a href="/wiki/Lotfi_A._Zadeh" title="Lotfi A. Zadeh">Lotfi A. 
Zadeh</a></li> <li><a href="/wiki/Stephen_Grossberg" title="Stephen Grossberg">Stephen Grossberg</a></li> <li><a href="/wiki/Alex_Graves_(computer_scientist)" title="Alex Graves (computer scientist)">Alex Graves</a></li> <li><a href="/wiki/Andrew_Ng" title="Andrew Ng">Andrew Ng</a></li> <li><a href="/wiki/Fei-Fei_Li" title="Fei-Fei Li">Fei-Fei Li</a></li> <li><a href="/wiki/Alex_Krizhevsky" title="Alex Krizhevsky">Alex Krizhevsky</a></li> <li><a href="/wiki/Ilya_Sutskever" title="Ilya Sutskever">Ilya Sutskever</a></li> <li><a href="/wiki/Demis_Hassabis" title="Demis Hassabis">Demis Hassabis</a></li> <li><a href="/wiki/David_Silver_(computer_scientist)" title="David Silver (computer scientist)">David Silver</a></li> <li><a href="/wiki/Ian_Goodfellow" title="Ian Goodfellow">Ian Goodfellow</a></li> <li><a href="/wiki/Andrej_Karpathy" title="Andrej Karpathy">Andrej Karpathy</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Architectures</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Neural_Turing_machine" title="Neural Turing machine">Neural Turing machine</a></li> <li><a href="/wiki/Differentiable_neural_computer" title="Differentiable neural computer">Differentiable neural computer</a></li> <li><a href="/wiki/Transformer_(deep_learning_architecture)" title="Transformer (deep learning architecture)">Transformer</a> <ul><li><a href="/wiki/Vision_transformer" title="Vision transformer">Vision transformer (ViT)</a></li></ul></li> <li><a href="/wiki/Recurrent_neural_network" title="Recurrent neural network">Recurrent neural network (RNN)</a></li> <li><a href="/wiki/Long_short-term_memory" title="Long short-term memory">Long short-term memory (LSTM)</a></li> <li><a href="/wiki/Gated_recurrent_unit" title="Gated recurrent unit">Gated recurrent unit (GRU)</a></li> <li><a href="/wiki/Echo_state_network" title="Echo state network">Echo state 
network</a></li> <li><a href="/wiki/Multilayer_perceptron" title="Multilayer perceptron">Multilayer perceptron (MLP)</a></li> <li><a href="/wiki/Convolutional_neural_network" title="Convolutional neural network">Convolutional neural network (CNN)</a></li> <li><a href="/wiki/Residual_neural_network" title="Residual neural network">Residual neural network (RNN)</a></li> <li><a href="/wiki/Highway_network" title="Highway network">Highway network</a></li> <li><a href="/wiki/Mamba_(deep_learning_architecture)" title="Mamba (deep learning architecture)">Mamba</a></li> <li><a href="/wiki/Autoencoder" title="Autoencoder">Autoencoder</a></li> <li><a href="/wiki/Variational_autoencoder" title="Variational autoencoder">Variational autoencoder (VAE)</a></li> <li><a href="/wiki/Generative_adversarial_network" title="Generative adversarial network">Generative adversarial network (GAN)</a></li> <li><a href="/wiki/Graph_neural_network" title="Graph neural network">Graph neural network (GNN)</a></li></ul> </div></td></tr><tr><td class="navbox-abovebelow" colspan="2"><div> <ul><li><span class="noviewer" typeof="mw:File"><a href="/wiki/File:Symbol_portal_class.svg" class="mw-file-description" title="Portal"><img alt="" src="//upload.wikimedia.org/wikipedia/en/thumb/e/e2/Symbol_portal_class.svg/16px-Symbol_portal_class.svg.png" decoding="async" width="16" height="16" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/en/thumb/e/e2/Symbol_portal_class.svg/23px-Symbol_portal_class.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/e/e2/Symbol_portal_class.svg/31px-Symbol_portal_class.svg.png 2x" data-file-width="180" data-file-height="185" /></a></span> Portals <ul><li><a href="/wiki/Portal:Technology" title="Portal:Technology">Technology</a></li></ul></li> <li><span class="noviewer" typeof="mw:File"><span title="Category"><img alt="" src="//upload.wikimedia.org/wikipedia/en/thumb/9/96/Symbol_category_class.svg/16px-Symbol_category_class.svg.png" decoding="async" 
width="16" height="16" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/96/Symbol_category_class.svg/23px-Symbol_category_class.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/96/Symbol_category_class.svg/31px-Symbol_category_class.svg.png 2x" data-file-width="180" data-file-height="185" /></span></span> Categories <ul><li><a href="/wiki/Category:Artificial_neural_networks" title="Category:Artificial neural networks">Artificial neural networks</a></li> <li><a href="/wiki/Category:Machine_learning" title="Category:Machine learning">Machine learning</a></li></ul></li> <li><span class="noviewer" typeof="mw:File"><span title="List-Class article"><img alt="" src="//upload.wikimedia.org/wikipedia/en/thumb/d/db/Symbol_list_class.svg/16px-Symbol_list_class.svg.png" decoding="async" width="16" height="16" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/en/thumb/d/db/Symbol_list_class.svg/23px-Symbol_list_class.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/d/db/Symbol_list_class.svg/31px-Symbol_list_class.svg.png 2x" data-file-width="180" data-file-height="185" /></span></span> List <ul><li><a href="/wiki/List_of_artificial_intelligence_companies" title="List of artificial intelligence companies">Companies</a></li> <li><a href="/wiki/List_of_artificial_intelligence_projects" title="List of artificial intelligence projects">Projects</a></li></ul></li></ul> </div></td></tr></tbody></table></div> <!-- NewPP limit report Parsed by mw‐api‐int.codfw.main‐6fdd9f9b88‐rtfrh Cached time: 20241129131517 Cache expiry: 2592000 Reduced expiry: false Complications: [vary‐revision‐sha1, show‐toc] CPU time usage: 1.235 seconds Real time usage: 1.425 seconds Preprocessor visited node count: 23846/1000000 Post‐expand include size: 160662/2097152 bytes Template argument size: 6465/2097152 bytes Highest expansion depth: 15/100 Expensive parser function count: 5/500 Unstrip recursion depth: 1/20 Unstrip post‐expand size: 
178129/5000000 bytes Lua time usage: 0.643/10.000 seconds Lua memory usage: 6507281/52428800 bytes Number of Wikibase entities loaded: 0/400 --> <!-- Transclusion expansion time report (%,ms,calls,template) 100.00% 1245.236 1 -total 43.87% 546.309 1 Template:Reflist 34.44% 428.811 64 Template:R 32.98% 410.661 64 Template:R/ref 18.82% 234.402 64 Template:R/superscript 11.56% 143.906 4 Template:Cite_book 10.48% 130.476 13 Template:Cite_journal 9.95% 123.962 136 Template:R/where 8.55% 106.514 3 Template:Navbox 8.23% 102.492 11 Template:Cite_conference --> <!-- Saved in parser cache with key enwiki:pcache:idhash:47961606-0!canonical and timestamp 20241129131517 and revision id 1257438152. Rendering was triggered because: api-parse --> </div><!--esi <esi:include src="/esitest-fa8a495983347898/content" /> --><noscript><img src="https://login.wikimedia.org/wiki/Special:CentralAutoLogin/start?type=1x1&useformat=desktop" alt="" width="1" height="1" style="border: none; position: absolute;"></noscript> <div class="printfooter" data-nosnippet="">Retrieved from "<a dir="ltr" href="https://en.wikipedia.org/w/index.php?title=Neural_machine_translation&oldid=1257438152">https://en.wikipedia.org/w/index.php?title=Neural_machine_translation&oldid=1257438152</a>"</div></div> <div id="catlinks" class="catlinks" data-mw="interface"><div id="mw-normal-catlinks" class="mw-normal-catlinks"><a href="/wiki/Help:Category" title="Help:Category">Categories</a>: <ul><li><a href="/wiki/Category:Applications_of_artificial_intelligence" title="Category:Applications of artificial intelligence">Applications of artificial intelligence</a></li><li><a href="/wiki/Category:Computational_linguistics" title="Category:Computational linguistics">Computational linguistics</a></li><li><a href="/wiki/Category:Machine_translation" title="Category:Machine translation">Machine translation</a></li><li><a href="/wiki/Category:Tasks_of_natural_language_processing" title="Category:Tasks of natural language 
processing">Tasks of natural language processing</a></li></ul></div><div id="mw-hidden-catlinks" class="mw-hidden-catlinks mw-hidden-cats-hidden">Hidden categories: <ul><li><a href="/wiki/Category:CS1_errors:_generic_name" title="Category:CS1 errors: generic name">CS1 errors: generic name</a></li><li><a href="/wiki/Category:Articles_with_short_description" title="Category:Articles with short description">Articles with short description</a></li><li><a href="/wiki/Category:Short_description_is_different_from_Wikidata" title="Category:Short description is different from Wikidata">Short description is different from Wikidata</a></li><li><a href="/wiki/Category:All_articles_with_unsourced_statements" title="Category:All articles with unsourced statements">All articles with unsourced statements</a></li><li><a href="/wiki/Category:Articles_with_unsourced_statements_from_December_2023" title="Category:Articles with unsourced statements from December 2023">Articles with unsourced statements from December 2023</a></li></ul></div></div> </div> </main> </div> <div class="mw-footer-container"> <footer id="footer" class="mw-footer" > <ul id="footer-info"> <li id="footer-info-lastmod"> This page was last edited on 14 November 2024, at 22:08<span class="anonymous-show"> (UTC)</span>.</li> <li id="footer-info-copyright">Text is available under the <a href="/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_4.0_International_License" title="Wikipedia:Text of the Creative Commons Attribution-ShareAlike 4.0 International License">Creative Commons Attribution-ShareAlike 4.0 License</a>; additional terms may apply. 
By using this site, you agree to the <a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Terms_of_Use" class="extiw" title="foundation:Special:MyLanguage/Policy:Terms of Use">Terms of Use</a> and <a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy" class="extiw" title="foundation:Special:MyLanguage/Policy:Privacy policy">Privacy Policy</a>. Wikipedia® is a registered trademark of the <a rel="nofollow" class="external text" href="https://wikimediafoundation.org/">Wikimedia Foundation, Inc.</a>, a non-profit organization.</li> </ul> <ul id="footer-places"> <li id="footer-places-privacy"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy">Privacy policy</a></li> <li id="footer-places-about"><a href="/wiki/Wikipedia:About">About Wikipedia</a></li> <li id="footer-places-disclaimers"><a href="/wiki/Wikipedia:General_disclaimer">Disclaimers</a></li> <li id="footer-places-contact"><a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us">Contact Wikipedia</a></li> <li id="footer-places-wm-codeofconduct"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Universal_Code_of_Conduct">Code of Conduct</a></li> <li id="footer-places-developers"><a href="https://developer.wikimedia.org">Developers</a></li> <li id="footer-places-statslink"><a href="https://stats.wikimedia.org/#/en.wikipedia.org">Statistics</a></li> <li id="footer-places-cookiestatement"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Cookie_statement">Cookie statement</a></li> <li id="footer-places-mobileview"><a href="//en.m.wikipedia.org/w/index.php?title=Neural_machine_translation&mobileaction=toggle_view_mobile" class="noprint stopMobileRedirectToggle">Mobile view</a></li> </ul> <ul id="footer-icons" class="noprint"> <li id="footer-copyrightico"><a href="https://wikimediafoundation.org/" class="cdx-button cdx-button--fake-button cdx-button--size-large 
cdx-button--fake-button--enabled"><img src="/static/images/footer/wikimedia-button.svg" width="84" height="29" alt="Wikimedia Foundation" loading="lazy"></a></li> <li id="footer-poweredbyico"><a href="https://www.mediawiki.org/" class="cdx-button cdx-button--fake-button cdx-button--size-large cdx-button--fake-button--enabled"><img src="/w/resources/assets/poweredby_mediawiki.svg" alt="Powered by MediaWiki" width="88" height="31" loading="lazy"></a></li> </ul> </footer> </div> </div> </div> <div class="vector-settings" id="p-dock-bottom"> <ul></ul> </div><script>(RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgHostname":"mw-web.codfw.main-5c59558b9d-rt6fq","wgBackendResponseTime":171,"wgPageParseReport":{"limitreport":{"cputime":"1.235","walltime":"1.425","ppvisitednodes":{"value":23846,"limit":1000000},"postexpandincludesize":{"value":160662,"limit":2097152},"templateargumentsize":{"value":6465,"limit":2097152},"expansiondepth":{"value":15,"limit":100},"expensivefunctioncount":{"value":5,"limit":500},"unstrip-depth":{"value":1,"limit":20},"unstrip-size":{"value":178129,"limit":5000000},"entityaccesscount":{"value":0,"limit":400},"timingprofile":["100.00% 1245.236 1 -total"," 43.87% 546.309 1 Template:Reflist"," 34.44% 428.811 64 Template:R"," 32.98% 410.661 64 Template:R/ref"," 18.82% 234.402 64 Template:R/superscript"," 11.56% 143.906 4 Template:Cite_book"," 10.48% 130.476 13 Template:Cite_journal"," 9.95% 123.962 136 Template:R/where"," 8.55% 106.514 3 Template:Navbox"," 8.23% 102.492 11 Template:Cite_conference"]},"scribunto":{"limitreport-timeusage":{"value":"0.643","limit":"10.000"},"limitreport-memusage":{"value":6507281,"limit":52428800}},"cachereport":{"origin":"mw-api-int.codfw.main-6fdd9f9b88-rtfrh","timestamp":"20241129131517","ttl":2592000,"transientcontent":false}}});});</script> <script type="application/ld+json">{"@context":"https:\/\/schema.org","@type":"Article","name":"Neural machine 
translation","url":"https:\/\/en.wikipedia.org\/wiki\/Neural_machine_translation","sameAs":"http:\/\/www.wikidata.org\/entity\/Q25053937","mainEntity":"http:\/\/www.wikidata.org\/entity\/Q25053937","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\/\/www.wikimedia.org\/static\/images\/wmf-hor-googpub.png"}},"datePublished":"2015-09-29T10:08:57Z","dateModified":"2024-11-14T22:08:00Z","headline":"approach to machine translation in which a large neural network is trained to maximize translation performance"}</script> </body> </html>