Reinforcement learning
Contents

1 Introduction
2 Exploration
3 Algorithms for control learning
    3.1 Criterion of optimality
        3.1.1 Policy
        3.1.2 State-value function
    3.2 Brute force
    3.3 Value function
        3.3.1 Monte Carlo methods
        3.3.2 Temporal difference methods
        3.3.3 Function approximation methods
    3.4 Direct policy search
    3.5 Model-based algorithms
4 Theory
5 Research
6 Comparison of key algorithms
    6.1 Associative reinforcement learning
    6.2 Deep reinforcement learning
    6.3 Adversarial deep reinforcement learning
    6.4 Fuzzy reinforcement learning
    6.5 Inverse reinforcement learning
    6.6 Safe reinforcement learning
    6.7 Self-reinforcement learning
7 Statistical comparison of reinforcement learning algorithms
8 See also
9 References
10 Further reading
11 External links
id="bodyContent" class="vector-body" aria-labelledby="firstHeading" data-mw-ve-target-container> <div class="vector-body-before-content"> <div class="mw-indicators"> </div> <div id="siteSub" class="noprint">From Wikipedia, the free encyclopedia</div> </div> <div id="contentSub"><div id="mw-content-subtitle"></div></div> <div id="mw-content-text" class="mw-body-content"><div class="mw-content-ltr mw-parser-output" lang="en" dir="ltr"><div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">Field of machine learning</div> <style data-mw-deduplicate="TemplateStyles:r1236090951">.mw-parser-output .hatnote{font-style:italic}.mw-parser-output div.hatnote{padding-left:1.6em;margin-bottom:0.5em}.mw-parser-output .hatnote i{font-style:normal}.mw-parser-output .hatnote+link+.hatnote{margin-top:-0.5em}@media print{body.ns-0 .mw-parser-output .hatnote{display:none!important}}</style><div role="note" class="hatnote navigation-not-searchable">For reinforcement learning in psychology, see <a href="/wiki/Reinforcement" title="Reinforcement">Reinforcement</a> and <a href="/wiki/Operant_conditioning" title="Operant conditioning">Operant conditioning</a>.</div> <style data-mw-deduplicate="TemplateStyles:r1244144826">.mw-parser-output .machine-learning-list-title{background-color:#ddddff}html.skin-theme-clientpref-night .mw-parser-output .machine-learning-list-title{background-color:#222}@media(prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .machine-learning-list-title{background-color:#222}}</style> <style data-mw-deduplicate="TemplateStyles:r1129693374">.mw-parser-output .hlist dl,.mw-parser-output .hlist ol,.mw-parser-output .hlist ul{margin:0;padding:0}.mw-parser-output .hlist dd,.mw-parser-output .hlist dt,.mw-parser-output .hlist li{margin:0;display:inline}.mw-parser-output .hlist.inline,.mw-parser-output .hlist.inline dl,.mw-parser-output .hlist.inline ol,.mw-parser-output .hlist.inline ul,.mw-parser-output .hlist dl dl,.mw-parser-output .hlist dl ol,.mw-parser-output .hlist dl ul,.mw-parser-output .hlist ol dl,.mw-parser-output .hlist ol ol,.mw-parser-output .hlist ol ul,.mw-parser-output .hlist ul dl,.mw-parser-output .hlist ul ol,.mw-parser-output .hlist ul ul{display:inline}.mw-parser-output .hlist .mw-empty-li{display:none}.mw-parser-output .hlist dt::after{content:": "}.mw-parser-output .hlist dd::after,.mw-parser-output .hlist li::after{content:" · ";font-weight:bold}.mw-parser-output .hlist dd:last-child::after,.mw-parser-output .hlist dt:last-child::after,.mw-parser-output .hlist li:last-child::after{content:none}.mw-parser-output .hlist dd dd:first-child::before,.mw-parser-output .hlist dd dt:first-child::before,.mw-parser-output .hlist dd li:first-child::before,.mw-parser-output .hlist dt dd:first-child::before,.mw-parser-output .hlist dt dt:first-child::before,.mw-parser-output .hlist dt li:first-child::before,.mw-parser-output .hlist li dd:first-child::before,.mw-parser-output .hlist li dt:first-child::before,.mw-parser-output .hlist li li:first-child::before{content:" (";font-weight:normal}.mw-parser-output .hlist dd dd:last-child::after,.mw-parser-output .hlist dd dt:last-child::after,.mw-parser-output .hlist dd li:last-child::after,.mw-parser-output .hlist dt dd:last-child::after,.mw-parser-output .hlist dt dt:last-child::after,.mw-parser-output .hlist dt li:last-child::after,.mw-parser-output .hlist li dd:last-child::after,.mw-parser-output .hlist li dt:last-child::after,.mw-parser-output .hlist li 
li:last-child::after{content:")";font-weight:normal}.mw-parser-output .hlist ol{counter-reset:listitem}.mw-parser-output .hlist ol>li{counter-increment:listitem}.mw-parser-output .hlist ol>li::before{content:" "counter(listitem)"\a0 "}.mw-parser-output .hlist dd ol>li:first-child::before,.mw-parser-output .hlist dt ol>li:first-child::before,.mw-parser-output .hlist li ol>li:first-child::before{content:" ("counter(listitem)"\a0 "}</style><style data-mw-deduplicate="TemplateStyles:r1246091330">.mw-parser-output .sidebar{width:22em;float:right;clear:right;margin:0.5em 0 1em 1em;background:var(--background-color-neutral-subtle,#f8f9fa);border:1px solid var(--border-color-base,#a2a9b1);padding:0.2em;text-align:center;line-height:1.4em;font-size:88%;border-collapse:collapse;display:table}body.skin-minerva .mw-parser-output .sidebar{display:table!important;float:right!important;margin:0.5em 0 1em 1em!important}.mw-parser-output .sidebar-subgroup{width:100%;margin:0;border-spacing:0}.mw-parser-output .sidebar-left{float:left;clear:left;margin:0.5em 1em 1em 0}.mw-parser-output .sidebar-none{float:none;clear:both;margin:0.5em 1em 1em 0}.mw-parser-output .sidebar-outer-title{padding:0 0.4em 0.2em;font-size:125%;line-height:1.2em;font-weight:bold}.mw-parser-output .sidebar-top-image{padding:0.4em}.mw-parser-output .sidebar-top-caption,.mw-parser-output .sidebar-pretitle-with-top-image,.mw-parser-output .sidebar-caption{padding:0.2em 0.4em 0;line-height:1.2em}.mw-parser-output .sidebar-pretitle{padding:0.4em 0.4em 0;line-height:1.2em}.mw-parser-output .sidebar-title,.mw-parser-output .sidebar-title-with-pretitle{padding:0.2em 0.8em;font-size:145%;line-height:1.2em}.mw-parser-output .sidebar-title-with-pretitle{padding:0.1em 0.4em}.mw-parser-output .sidebar-image{padding:0.2em 0.4em 0.4em}.mw-parser-output .sidebar-heading{padding:0.1em 0.4em}.mw-parser-output .sidebar-content{padding:0 0.5em 0.4em}.mw-parser-output .sidebar-content-with-subgroup{padding:0.1em 0.4em 0.2em}.mw-parser-output .sidebar-above,.mw-parser-output .sidebar-below{padding:0.3em 0.8em;font-weight:bold}.mw-parser-output .sidebar-collapse .sidebar-above,.mw-parser-output .sidebar-collapse .sidebar-below{border-top:1px solid #aaa;border-bottom:1px solid #aaa}.mw-parser-output .sidebar-navbar{text-align:right;font-size:115%;padding:0 0.4em 0.4em}.mw-parser-output .sidebar-list-title{padding:0 0.4em;text-align:left;font-weight:bold;line-height:1.6em;font-size:105%}.mw-parser-output .sidebar-list-title-c{padding:0 0.4em;text-align:center;margin:0 3.3em}@media(max-width:640px){body.mediawiki .mw-parser-output .sidebar{width:100%!important;clear:both;float:none!important;margin-left:0!important;margin-right:0!important}}body.skin--responsive .mw-parser-output .sidebar a>img{max-width:none!important}@media screen{html.skin-theme-clientpref-night .mw-parser-output .sidebar:not(.notheme) .sidebar-list-title,html.skin-theme-clientpref-night .mw-parser-output .sidebar:not(.notheme) .sidebar-title-with-pretitle{background:transparent!important}html.skin-theme-clientpref-night .mw-parser-output .sidebar:not(.notheme) .sidebar-title-with-pretitle a{color:var(--color-progressive)!important}}@media screen and (prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .sidebar:not(.notheme) .sidebar-list-title,html.skin-theme-clientpref-os .mw-parser-output .sidebar:not(.notheme) .sidebar-title-with-pretitle{background:transparent!important}html.skin-theme-clientpref-os .mw-parser-output .sidebar:not(.notheme) 
.sidebar-title-with-pretitle a{color:var(--color-progressive)!important}}@media print{body.ns-0 .mw-parser-output .sidebar{display:none!important}}</style><style data-mw-deduplicate="TemplateStyles:r886047488">.mw-parser-output .nobold{font-weight:normal}</style><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r886047488"><table class="sidebar sidebar-collapse nomobile nowraplinks"><tbody><tr><td class="sidebar-pretitle">Part of a series on</td></tr><tr><th class="sidebar-title-with-pretitle"><a href="/wiki/Machine_learning" title="Machine learning">Machine learning</a><br />and <a href="/wiki/Data_mining" title="Data mining">data mining</a></th></tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Paradigms</div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Supervised_learning" title="Supervised learning">Supervised learning</a></li> <li><a href="/wiki/Unsupervised_learning" title="Unsupervised learning">Unsupervised learning</a></li> <li><a href="/wiki/Semi-supervised_learning" class="mw-redirect" title="Semi-supervised learning">Semi-supervised learning</a></li> <li><a href="/wiki/Self-supervised_learning" title="Self-supervised learning">Self-supervised learning</a></li> <li><a class="mw-selflink selflink">Reinforcement learning</a></li> <li><a href="/wiki/Meta-learning_(computer_science)" title="Meta-learning (computer science)">Meta-learning</a></li> <li><a href="/wiki/Online_machine_learning" title="Online machine learning">Online learning</a></li> <li><a href="/wiki/Batch_learning" class="mw-redirect" title="Batch learning">Batch learning</a></li> <li><a href="/wiki/Curriculum_learning" title="Curriculum learning">Curriculum learning</a></li> <li><a href="/wiki/Rule-based_machine_learning" title="Rule-based machine learning">Rule-based learning</a></li> <li><a href="/wiki/Neuro-symbolic_AI" title="Neuro-symbolic AI">Neuro-symbolic AI</a></li> <li><a href="/wiki/Neuromorphic_engineering" class="mw-redirect" title="Neuromorphic engineering">Neuromorphic engineering</a></li> <li><a href="/wiki/Quantum_machine_learning" title="Quantum machine learning">Quantum machine learning</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Problems</div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Statistical_classification" title="Statistical classification">Classification</a></li> <li><a href="/wiki/Generative_model" title="Generative model">Generative modeling</a></li> <li><a href="/wiki/Regression_analysis" title="Regression analysis">Regression</a></li> <li><a href="/wiki/Cluster_analysis" title="Cluster analysis">Clustering</a></li> <li><a href="/wiki/Dimensionality_reduction" title="Dimensionality reduction">Dimensionality reduction</a></li> <li><a href="/wiki/Density_estimation" title="Density estimation">Density estimation</a></li> <li><a href="/wiki/Anomaly_detection" title="Anomaly detection">Anomaly detection</a></li> <li><a href="/wiki/Data_cleaning" class="mw-redirect" title="Data cleaning">Data cleaning</a></li> <li><a href="/wiki/Automated_machine_learning" title="Automated machine 
learning">AutoML</a></li> <li><a href="/wiki/Association_rule_learning" title="Association rule learning">Association rules</a></li> <li><a href="/wiki/Semantic_analysis_(machine_learning)" title="Semantic analysis (machine learning)">Semantic analysis</a></li> <li><a href="/wiki/Structured_prediction" title="Structured prediction">Structured prediction</a></li> <li><a href="/wiki/Feature_engineering" title="Feature engineering">Feature engineering</a></li> <li><a href="/wiki/Feature_learning" title="Feature learning">Feature learning</a></li> <li><a href="/wiki/Learning_to_rank" title="Learning to rank">Learning to rank</a></li> <li><a href="/wiki/Grammar_induction" title="Grammar induction">Grammar induction</a></li> <li><a href="/wiki/Ontology_learning" title="Ontology learning">Ontology learning</a></li> <li><a href="/wiki/Multimodal_learning" title="Multimodal learning">Multimodal learning</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><div style="display: inline-block; line-height: 1.2em; padding: .1em 0;"><a href="/wiki/Supervised_learning" title="Supervised learning">Supervised learning</a><br /><span class="nobold"><span style="font-size:85%;">(<b><a href="/wiki/Statistical_classification" title="Statistical classification">classification</a></b> • <b><a href="/wiki/Regression_analysis" title="Regression analysis">regression</a></b>)</span></span> </div></div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Apprenticeship_learning" title="Apprenticeship learning">Apprenticeship learning</a></li> <li><a href="/wiki/Decision_tree_learning" title="Decision tree learning">Decision trees</a></li> <li><a href="/wiki/Ensemble_learning" title="Ensemble learning">Ensembles</a> <ul><li><a href="/wiki/Bootstrap_aggregating" title="Bootstrap aggregating">Bagging</a></li> <li><a href="/wiki/Boosting_(machine_learning)" title="Boosting (machine learning)">Boosting</a></li> <li><a href="/wiki/Random_forest" title="Random forest">Random forest</a></li></ul></li> <li><a href="/wiki/K-nearest_neighbors_algorithm" title="K-nearest neighbors algorithm"><i>k</i>-NN</a></li> <li><a href="/wiki/Linear_regression" title="Linear regression">Linear regression</a></li> <li><a href="/wiki/Naive_Bayes_classifier" title="Naive Bayes classifier">Naive Bayes</a></li> <li><a href="/wiki/Artificial_neural_network" class="mw-redirect" title="Artificial neural network">Artificial neural networks</a></li> <li><a href="/wiki/Logistic_regression" title="Logistic regression">Logistic regression</a></li> <li><a href="/wiki/Perceptron" title="Perceptron">Perceptron</a></li> <li><a href="/wiki/Relevance_vector_machine" title="Relevance vector machine">Relevance vector machine (RVM)</a></li> <li><a href="/wiki/Support_vector_machine" title="Support vector machine">Support vector machine (SVM)</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><a href="/wiki/Cluster_analysis" title="Cluster analysis">Clustering</a></div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/BIRCH" title="BIRCH">BIRCH</a></li> <li><a 
href="/wiki/CURE_algorithm" title="CURE algorithm">CURE</a></li> <li><a href="/wiki/Hierarchical_clustering" title="Hierarchical clustering">Hierarchical</a></li> <li><a href="/wiki/K-means_clustering" title="K-means clustering"><i>k</i>-means</a></li> <li><a href="/wiki/Fuzzy_clustering" title="Fuzzy clustering">Fuzzy</a></li> <li><a href="/wiki/Expectation%E2%80%93maximization_algorithm" title="Expectation–maximization algorithm">Expectation–maximization (EM)</a></li> <li><br /><a href="/wiki/DBSCAN" title="DBSCAN">DBSCAN</a></li> <li><a href="/wiki/OPTICS_algorithm" title="OPTICS algorithm">OPTICS</a></li> <li><a href="/wiki/Mean_shift" title="Mean shift">Mean shift</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><a href="/wiki/Dimensionality_reduction" title="Dimensionality reduction">Dimensionality reduction</a></div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Factor_analysis" title="Factor analysis">Factor analysis</a></li> <li><a href="/wiki/Canonical_correlation" title="Canonical correlation">CCA</a></li> <li><a href="/wiki/Independent_component_analysis" title="Independent component analysis">ICA</a></li> <li><a href="/wiki/Linear_discriminant_analysis" title="Linear discriminant analysis">LDA</a></li> <li><a href="/wiki/Non-negative_matrix_factorization" title="Non-negative matrix factorization">NMF</a></li> <li><a href="/wiki/Principal_component_analysis" title="Principal component analysis">PCA</a></li> <li><a href="/wiki/Proper_generalized_decomposition" title="Proper generalized decomposition">PGD</a></li> <li><a href="/wiki/T-distributed_stochastic_neighbor_embedding" title="T-distributed stochastic neighbor embedding">t-SNE</a></li> <li><a href="/wiki/Sparse_dictionary_learning" title="Sparse dictionary learning">SDL</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><a href="/wiki/Structured_prediction" title="Structured prediction">Structured prediction</a></div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Graphical_model" title="Graphical model">Graphical models</a> <ul><li><a href="/wiki/Bayesian_network" title="Bayesian network">Bayes net</a></li> <li><a href="/wiki/Conditional_random_field" title="Conditional random field">Conditional random field</a></li> <li><a href="/wiki/Hidden_Markov_model" title="Hidden Markov model">Hidden Markov</a></li></ul></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><a href="/wiki/Anomaly_detection" title="Anomaly detection">Anomaly detection</a></div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Random_sample_consensus" title="Random sample consensus">RANSAC</a></li> <li><a href="/wiki/K-nearest_neighbors_algorithm" title="K-nearest neighbors algorithm"><i>k</i>-NN</a></li> <li><a href="/wiki/Local_outlier_factor" title="Local outlier factor">Local outlier factor</a></li> <li><a 
href="/wiki/Isolation_forest" title="Isolation forest">Isolation forest</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><a href="/wiki/Artificial_neural_network" class="mw-redirect" title="Artificial neural network">Artificial neural network</a></div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Autoencoder" title="Autoencoder">Autoencoder</a></li> <li><a href="/wiki/Deep_learning" title="Deep learning">Deep learning</a></li> <li><a href="/wiki/Feedforward_neural_network" title="Feedforward neural network">Feedforward neural network</a></li> <li><a href="/wiki/Recurrent_neural_network" title="Recurrent neural network">Recurrent neural network</a> <ul><li><a href="/wiki/Long_short-term_memory" title="Long short-term memory">LSTM</a></li> <li><a href="/wiki/Gated_recurrent_unit" title="Gated recurrent unit">GRU</a></li> <li><a href="/wiki/Echo_state_network" title="Echo state network">ESN</a></li> <li><a href="/wiki/Reservoir_computing" title="Reservoir computing">reservoir computing</a></li></ul></li> <li><a href="/wiki/Boltzmann_machine" title="Boltzmann machine">Boltzmann machine</a> <ul><li><a href="/wiki/Restricted_Boltzmann_machine" title="Restricted Boltzmann machine">Restricted</a></li></ul></li> <li><a href="/wiki/Generative_adversarial_network" title="Generative adversarial network">GAN</a></li> <li><a href="/wiki/Diffusion_model" title="Diffusion model">Diffusion model</a></li> <li><a href="/wiki/Self-organizing_map" title="Self-organizing map">SOM</a></li> <li><a href="/wiki/Convolutional_neural_network" title="Convolutional neural network">Convolutional neural network</a> <ul><li><a href="/wiki/U-Net" title="U-Net">U-Net</a></li> <li><a href="/wiki/LeNet" title="LeNet">LeNet</a></li> <li><a href="/wiki/AlexNet" title="AlexNet">AlexNet</a></li> <li><a href="/wiki/DeepDream" title="DeepDream">DeepDream</a></li></ul></li> <li><a href="/wiki/Neural_radiance_field" title="Neural radiance field">Neural radiance field</a></li> <li><a href="/wiki/Transformer_(machine_learning_model)" class="mw-redirect" title="Transformer (machine learning model)">Transformer</a> <ul><li><a href="/wiki/Vision_transformer" title="Vision transformer">Vision</a></li></ul></li> <li><a href="/wiki/Mamba_(deep_learning_architecture)" title="Mamba (deep learning architecture)">Mamba</a></li> <li><a href="/wiki/Spiking_neural_network" title="Spiking neural network">Spiking neural network</a></li> <li><a href="/wiki/Memtransistor" title="Memtransistor">Memtransistor</a></li> <li><a href="/wiki/Electrochemical_RAM" title="Electrochemical RAM">Electrochemical RAM</a> (ECRAM)</li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><a class="mw-selflink selflink">Reinforcement learning</a></div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Q-learning" title="Q-learning">Q-learning</a></li> <li><a href="/wiki/State%E2%80%93action%E2%80%93reward%E2%80%93state%E2%80%93action" title="State–action–reward–state–action">SARSA</a></li> <li><a href="/wiki/Temporal_difference_learning" title="Temporal difference learning">Temporal difference 
(TD)</a></li> <li><a href="/wiki/Multi-agent_reinforcement_learning" title="Multi-agent reinforcement learning">Multi-agent</a> <ul><li><a href="/wiki/Self-play_(reinforcement_learning_technique)" class="mw-redirect" title="Self-play (reinforcement learning technique)">Self-play</a></li></ul></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Learning with humans</div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Active_learning_(machine_learning)" title="Active learning (machine learning)">Active learning</a></li> <li><a href="/wiki/Crowdsourcing" title="Crowdsourcing">Crowdsourcing</a></li> <li><a href="/wiki/Human-in-the-loop" title="Human-in-the-loop">Human-in-the-loop</a></li> <li><a href="/wiki/Reinforcement_learning_from_human_feedback" title="Reinforcement learning from human feedback">RLHF</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Model diagnostics</div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Coefficient_of_determination" title="Coefficient of determination">Coefficient of determination</a></li> <li><a href="/wiki/Confusion_matrix" title="Confusion matrix">Confusion matrix</a></li> <li><a href="/wiki/Learning_curve_(machine_learning)" title="Learning curve (machine learning)">Learning curve</a></li> <li><a href="/wiki/Receiver_operating_characteristic" title="Receiver operating characteristic">ROC curve</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Mathematical foundations</div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Kernel_machines" class="mw-redirect" title="Kernel machines">Kernel machines</a></li> <li><a href="/wiki/Bias%E2%80%93variance_tradeoff" title="Bias–variance tradeoff">Bias–variance tradeoff</a></li> <li><a href="/wiki/Computational_learning_theory" title="Computational learning theory">Computational learning theory</a></li> <li><a href="/wiki/Empirical_risk_minimization" title="Empirical risk minimization">Empirical risk minimization</a></li> <li><a href="/wiki/Occam_learning" title="Occam learning">Occam learning</a></li> <li><a href="/wiki/Probably_approximately_correct_learning" title="Probably approximately correct learning">PAC learning</a></li> <li><a href="/wiki/Statistical_learning_theory" title="Statistical learning theory">Statistical learning</a></li> <li><a href="/wiki/Vapnik%E2%80%93Chervonenkis_theory" title="Vapnik–Chervonenkis theory">VC theory</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Journals and conferences</div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/ECML_PKDD" title="ECML PKDD">ECML PKDD</a></li> <li><a 
href="/wiki/Conference_on_Neural_Information_Processing_Systems" title="Conference on Neural Information Processing Systems">NeurIPS</a></li> <li><a href="/wiki/International_Conference_on_Machine_Learning" title="International Conference on Machine Learning">ICML</a></li> <li><a href="/wiki/International_Conference_on_Learning_Representations" title="International Conference on Learning Representations">ICLR</a></li> <li><a href="/wiki/International_Joint_Conference_on_Artificial_Intelligence" title="International Joint Conference on Artificial Intelligence">IJCAI</a></li> <li><a href="/wiki/Machine_Learning_(journal)" title="Machine Learning (journal)">ML</a></li> <li><a href="/wiki/Journal_of_Machine_Learning_Research" title="Journal of Machine Learning Research">JMLR</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Related articles</div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Glossary_of_artificial_intelligence" title="Glossary of artificial intelligence">Glossary of artificial intelligence</a></li> <li><a href="/wiki/List_of_datasets_for_machine-learning_research" title="List of datasets for machine-learning research">List of datasets for machine-learning research</a> <ul><li><a href="/wiki/List_of_datasets_in_computer_vision_and_image_processing" title="List of datasets in computer vision and image processing">List of datasets in computer vision and image processing</a></li></ul></li> <li><a href="/wiki/Outline_of_machine_learning" title="Outline of machine learning">Outline of machine learning</a></li></ul></div></div></td> </tr><tr><td class="sidebar-navbar"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374"><style data-mw-deduplicate="TemplateStyles:r1239400231">.mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{float:left;text-align:left}.mw-parser-output .navbar-boxtext{word-spacing:0}.mw-parser-output .navbar ul{display:inline-block;white-space:nowrap;line-height:inherit}.mw-parser-output .navbar-brackets::before{margin-right:-0.125em;content:"[ "}.mw-parser-output .navbar-brackets::after{margin-left:-0.125em;content:" ]"}.mw-parser-output .navbar li{word-spacing:-0.125em}.mw-parser-output .navbar a>span,.mw-parser-output .navbar a>abbr{text-decoration:inherit}.mw-parser-output .navbar-mini abbr{font-variant:small-caps;border-bottom:none;text-decoration:none;cursor:inherit}.mw-parser-output .navbar-ct-full{font-size:114%;margin:0 7em}.mw-parser-output .navbar-ct-mini{font-size:114%;margin:0 4em}html.skin-theme-clientpref-night .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}@media(prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}}@media print{.mw-parser-output .navbar{display:none!important}}</style><div class="navbar plainlinks hlist navbar-mini"><ul><li class="nv-view"><a href="/wiki/Template:Machine_learning" title="Template:Machine learning"><abbr title="View this template">v</abbr></a></li><li class="nv-talk"><a href="/wiki/Template_talk:Machine_learning" title="Template talk:Machine learning"><abbr title="Discuss this template">t</abbr></a></li><li class="nv-edit"><a href="/wiki/Special:EditPage/Template:Machine_learning" 
title="Special:EditPage/Template:Machine learning"><abbr title="Edit this template">e</abbr></a></li></ul></div></td></tr></tbody></table> <p><b>Reinforcement learning</b> (<b>RL</b>) is an interdisciplinary area of <a href="/wiki/Machine_learning" title="Machine learning">machine learning</a> and <a href="/wiki/Optimal_control" title="Optimal control">optimal control</a> concerned with how an <a href="/wiki/Intelligent_agent" title="Intelligent agent">intelligent agent</a> should <a href="/wiki/Action_selection" title="Action selection">take actions</a> in a dynamic environment in order to <a href="/wiki/Reward-based_selection" title="Reward-based selection">maximize a reward</a> signal. Reinforcement learning is one of the <a href="/wiki/Machine_learning#Approaches" title="Machine learning">three basic machine learning paradigms</a>, alongside <a href="/wiki/Supervised_learning" title="Supervised learning">supervised learning</a> and <a href="/wiki/Unsupervised_learning" title="Unsupervised learning">unsupervised learning</a>. </p><p><a href="/wiki/Q-learning" title="Q-learning">Q-learning</a> at its simplest stores data in tables. This approach becomes <a href="/wiki/Computational_complexity_theory" title="Computational complexity theory">infeasible</a> as the number of states/actions increases (e.g., if the state space or action space were continuous), as the probability of the agent visiting a particular state and performing a particular action diminishes. </p><p>Reinforcement learning differs from supervised learning in not needing labelled input-output pairs to be presented, and in not needing sub-optimal actions to be explicitly corrected. Instead, the focus is on finding a balance between exploration (of uncharted territory) and exploitation (of current knowledge) with the goal of maximizing the cumulative reward (the feedback of which might be incomplete or delayed).<sup id="cite_ref-kaelbling_1-0" class="reference"><a href="#cite_note-kaelbling-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup> The search for this balance is known as the <a href="/wiki/Exploration-exploitation_dilemma" title="Exploration-exploitation dilemma">exploration-exploitation dilemma</a>. 
The environment is typically stated in the form of a Markov decision process (MDP), as many reinforcement learning algorithms use dynamic programming techniques.[2] The main difference between classical dynamic programming methods and reinforcement learning algorithms is that the latter do not assume knowledge of an exact mathematical model of the Markov decision process, and they target large MDPs where exact methods become infeasible.[3]

Introduction

[Figure: The typical framing of a Reinforcement Learning (RL) scenario: an agent takes actions in an environment, which is interpreted into a reward and a state representation, which are fed back to the agent.]

Due to its generality, reinforcement learning is studied in many disciplines, such as game theory, control theory, operations research, information theory, simulation-based optimization, multi-agent systems, swarm intelligence, and statistics.
In the operations research and control literature, RL is called approximate dynamic programming, or neuro-dynamic programming. The problems of interest in RL have also been studied in the theory of optimal control, which is concerned mostly with the existence and characterization of optimal solutions, and algorithms for their exact computation, and less with learning or approximation (particularly in the absence of a mathematical model of the environment).

Basic reinforcement learning is modeled as a Markov decision process:

* A set of environment and agent states (the state space), $\mathcal{S}$;
* A set of actions (the action space), $\mathcal{A}$, of the agent;
* $P_a(s,s') = \Pr(S_{t+1} = s' \mid S_t = s, A_t = a)$, the transition probability (at time $t$) from state $s$ to state $s'$ under action $a$;
* $R_a(s,s')$, the immediate reward after transition from $s$ to $s'$ under action $a$.
The purpose of reinforcement learning is for the agent to learn an optimal (or near-optimal) policy that maximizes the reward function or other user-provided reinforcement signal that accumulates from immediate rewards. This is similar to processes that appear to occur in animal psychology. For example, biological brains are hardwired to interpret signals such as pain and hunger as negative reinforcements, and interpret pleasure and food intake as positive reinforcements. In some circumstances, animals learn to adopt behaviors that optimize these rewards. This suggests that animals are capable of reinforcement learning.[4][5]

A basic reinforcement learning agent interacts with its environment in discrete time steps. At each time step $t$, the agent receives the current state $S_t$ and reward $R_t$. It then chooses an action $A_t$ from the set of available actions, which is subsequently sent to the environment. The environment moves to a new state $S_{t+1}$, and the reward $R_{t+1}$ associated with the transition $(S_t, A_t, S_{t+1})$ is determined.
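The discrete-time interaction just described can be sketched as a simple loop. The toy environment and the reset()/step() method names below are stand-ins invented for illustration (they follow a common simulator convention, not anything defined in the article).

```python
# Sketch of the agent-environment interaction loop: observe S_t, emit A_t,
# receive S_{t+1} and R_{t+1}, and accumulate the reward.
import random

class ToyEnv:
    """Toy environment: the state is a step counter; episodes end at 10 steps."""
    def reset(self):
        self.state = 0
        return self.state                              # S_0
    def step(self, action):
        self.state += 1                                # S_{t+1}
        reward = 1.0 if random.random() < 0.5 else -1.0  # R_{t+1}
        return self.state, reward, self.state >= 10

def run_episode(env, choose_action):
    state = env.reset()
    total_reward, done = 0.0, False
    while not done:
        action = choose_action(state)                  # A_t from the policy
        state, reward, done = env.step(action)
        total_reward += reward
    return total_reward

print(run_episode(ToyEnv(), lambda s: "go"))
```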
The goal of a reinforcement learning agent is to learn a policy

$$\pi : \mathcal{S} \times \mathcal{A} \to [0,1], \qquad \pi(s,a) = \Pr(A_t = a \mid S_t = s),$$

that maximizes the expected cumulative reward.

Formulating the problem as a Markov decision process assumes the agent directly observes the current environmental state; in this case, the problem is said to have full observability. If the agent only has access to a subset of states, or if the observed states are corrupted by noise, the agent is said to have partial observability, and formally the problem must be formulated as a partially observable Markov decision process. In both cases, the set of actions available to the agent can be restricted. For example, the state of an account balance could be restricted to be positive; if the current value of the state is 3 and the state transition attempts to reduce the value by 4, the transition will not be allowed.
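A stochastic policy $\pi(s,a)$ of the kind defined above can be represented explicitly as a table of probabilities and sampled from. This is a sketch with invented states, actions, and probabilities, purely to make the mapping $\pi : \mathcal{S} \times \mathcal{A} \to [0,1]$ concrete.

```python
# A stochastic policy pi(s, a) = Pr(A_t = a | S_t = s) stored as a table.
import random

pi = {
    "low":  {"wait": 0.3, "work": 0.7},
    "high": {"wait": 0.9, "work": 0.1},
}

def sample_action(state):
    """Draw an action according to the probabilities pi(state, .)."""
    actions, probs = zip(*pi[state].items())
    return random.choices(actions, weights=probs, k=1)[0]

print(sample_action("low"))
```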
When the agent's performance is compared to that of an agent that acts optimally, the difference in performance yields the notion of regret. In order to act near optimally, the agent must reason about long-term consequences of its actions (i.e., maximize future rewards), although the immediate reward associated with this might be negative.

Thus, reinforcement learning is particularly well-suited to problems that include a long-term versus short-term reward trade-off. It has been applied successfully to various problems, including energy storage,[6] robot control,[7] photovoltaic generators,[8] backgammon, checkers,[9] Go (AlphaGo), and autonomous driving systems.[10]

Two elements make reinforcement learning powerful: the use of samples to optimize performance, and the use of function approximation to deal with large environments. Thanks to these two key components, RL can be used in large environments in the following situations:

* A model of the environment is known, but an analytic solution is not available;
* Only a simulation model of the environment is given (the subject of simulation-based optimization);[11]
* The only way to collect information about the environment is to interact with it.

The first two of these problems could be considered planning problems (since some form of model is available), while the last one could be considered to be a genuine learning problem. However, reinforcement learning converts both planning problems to machine learning problems.
== Exploration ==

The exploration vs. exploitation trade-off has been most thoroughly studied through the multi-armed bandit problem and for finite state space Markov decision processes in Burnetas and Katehakis (1997).[12]

Reinforcement learning requires clever exploration mechanisms; randomly selecting actions, without reference to an estimated probability distribution, shows poor performance. The case of (small) finite Markov decision processes is relatively well understood. However, due to the lack of algorithms that scale well with the number of states (or scale to problems with infinite state spaces), simple exploration methods are the most practical.

One such method is <math>\varepsilon</math>-greedy, where <math>0 < \varepsilon < 1</math> is a parameter controlling the amount of exploration vs. exploitation. With probability <math>1-\varepsilon</math>, exploitation is chosen, and the agent chooses the action that it believes has the best long-term effect (ties between actions are broken uniformly at random). Alternatively, with probability <math>\varepsilon</math>, exploration is chosen, and the action is chosen uniformly at random. <math>\varepsilon</math> is usually a fixed parameter but can be adjusted either according to a schedule (making the agent explore progressively less), or adaptively based on heuristics.[13]
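For illustration, a minimal Python sketch of <math>\varepsilon</math>-greedy action selection over a tabular action-value estimate; the <code>q_values</code> dictionary and its default of 0.0 for unseen pairs are assumptions made for this example, not part of the formulation above.

<syntaxhighlight lang="python">
import random

def epsilon_greedy(q_values, state, actions, epsilon=0.1):
    """Pick an action for `state` using epsilon-greedy exploration.

    q_values: dict mapping (state, action) -> estimated long-term return (assumed).
    actions:  list of actions available in `state`.
    """
    if random.random() < epsilon:
        # Explore: choose an action uniformly at random.
        return random.choice(actions)
    # Exploit: choose a highest-valued action, breaking ties uniformly at random.
    best = max(q_values.get((state, a), 0.0) for a in actions)
    best_actions = [a for a in actions if q_values.get((state, a), 0.0) == best]
    return random.choice(best_actions)
</syntaxhighlight>

A decaying schedule can be applied by simply lowering the <code>epsilon</code> argument over time, which corresponds to making the agent explore progressively less.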
== Algorithms for control learning ==

Even if the issue of exploration is disregarded and even if the state were observable (assumed hereafter), the problem remains to use past experience to find out which actions lead to higher cumulative rewards.
=== Criterion of optimality ===

==== Policy ====

The agent's action selection is modeled as a map called ''policy'':

: <math>\pi : \mathcal{A} \times \mathcal{S} \rightarrow [0,1]</math>
: <math>\pi(a,s) = \Pr(A_t = a \mid S_t = s)</math>

The policy map gives the probability of taking action <math>a</math> when in state <math>s</math>.[14]: 61  There are also deterministic policies.
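As a concrete (illustrative, not canonical) representation, a stochastic policy over finite state and action sets can be stored as a table of probabilities <math>\pi(a,s)</math> and sampled directly; the state and action names below are made up for the example.

<syntaxhighlight lang="python">
import random

# Hypothetical two-state, two-action policy: policy[s][a] = Pr(A_t = a | S_t = s).
policy = {
    "s0": {"left": 0.8, "right": 0.2},
    "s1": {"left": 0.1, "right": 0.9},
}

def sample_action(policy, state):
    """Draw an action according to the probabilities the policy assigns in `state`."""
    actions, probs = zip(*policy[state].items())
    return random.choices(actions, weights=probs, k=1)[0]

print(sample_action(policy, "s0"))  # 'left' roughly 80% of the time
</syntaxhighlight>

A deterministic policy would instead map each state to a single action.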
alttext="{\displaystyle a}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>a</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle a}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/ffd2487510aa438433a2579450ab2b3d557e5edc" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.23ex; height:1.676ex;" alt="{\displaystyle a}"></span> when in state <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle s}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>s</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle s}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/01d131dfd7673938b947072a13a9744fe997e632" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.09ex; height:1.676ex;" alt="{\displaystyle s}"></span>.<sup id="cite_ref-:0_14-0" class="reference"><a href="#cite_note-:0-14"><span class="cite-bracket">[</span>14<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page / location: 61">: 61 </span></sup> There are also deterministic policies. </p> <div class="mw-heading mw-heading4"><h4 id="State-value_function">State-value function</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Reinforcement_learning&action=edit&section=6" title="Edit section: State-value function"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>The state-value function <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle V_{\pi }(s)}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>V</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>π<!-- π --></mi> </mrow> </msub> <mo stretchy="false">(</mo> <mi>s</mi> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle V_{\pi }(s)}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/04f08b168d05891950e565017e284abee8bf7cf1" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:5.429ex; height:2.843ex;" alt="{\displaystyle V_{\pi }(s)}"></span> is defined as, <i>expected discounted return</i> starting with state <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle s}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>s</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle s}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/01d131dfd7673938b947072a13a9744fe997e632" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.09ex; height:1.676ex;" alt="{\displaystyle s}"></span>, i.e. 
<span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle S_{0}=s}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>S</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>0</mn> </mrow> </msub> <mo>=</mo> <mi>s</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle S_{0}=s}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/76b9df1ecbdbd4dbae77931a7aa185959d3b55f3" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:6.668ex; height:2.509ex;" alt="{\displaystyle S_{0}=s}"></span>, and successively following policy <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \pi }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>π<!-- π --></mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \pi }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/9be4ba0bb8df3af72e90a0535fabcc17431e540a" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.332ex; height:1.676ex;" alt="{\displaystyle \pi }"></span>. Hence, roughly speaking, the value function estimates "how good" it is to be in a given state.<sup id="cite_ref-:0_14-1" class="reference"><a href="#cite_note-:0-14"><span class="cite-bracket">[</span>14<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page / location: 60">: 60 </span></sup> </p> <dl><dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle V_{\pi }(s)=\operatorname {\mathbb {E} } [G\mid S_{0}=s]=\operatorname {\mathbb {E} } \left[\sum _{t=0}^{\infty }\gamma ^{t}R_{t+1}\mid S_{0}=s\right],}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>V</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>π<!-- π --></mi> </mrow> </msub> <mo stretchy="false">(</mo> <mi>s</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mrow class="MJX-TeXAtom-OP MJX-fixedlimits"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="double-struck">E</mi> </mrow> </mrow> <mo>⁡<!-- --></mo> <mo stretchy="false">[</mo> <mi>G</mi> <mo>∣<!-- ∣ --></mo> <msub> <mi>S</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>0</mn> </mrow> </msub> <mo>=</mo> <mi>s</mi> <mo stretchy="false">]</mo> <mo>=</mo> <mrow class="MJX-TeXAtom-OP MJX-fixedlimits"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="double-struck">E</mi> </mrow> </mrow> <mo>⁡<!-- --></mo> <mrow> <mo>[</mo> <mrow> <munderover> <mo>∑<!-- ∑ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>=</mo> <mn>0</mn> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">∞<!-- ∞ --></mi> </mrow> </munderover> <msup> <mi>γ<!-- γ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msup> <msub> <mi>R</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>+</mo> <mn>1</mn> </mrow> </msub> <mo>∣<!-- ∣ --></mo> <msub> <mi>S</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>0</mn> </mrow> </msub> <mo>=</mo> <mi>s</mi> </mrow> <mo>]</mo> </mrow> <mo>,</mo> </mstyle> 
</mrow> <annotation encoding="application/x-tex">{\displaystyle V_{\pi }(s)=\operatorname {\mathbb {E} } [G\mid S_{0}=s]=\operatorname {\mathbb {E} } \left[\sum _{t=0}^{\infty }\gamma ^{t}R_{t+1}\mid S_{0}=s\right],}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f84b6e9913e812561947e07eb7bee813a398e879" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -3.171ex; width:48.953ex; height:7.509ex;" alt="{\displaystyle V_{\pi }(s)=\operatorname {\mathbb {E} } [G\mid S_{0}=s]=\operatorname {\mathbb {E} } \left[\sum _{t=0}^{\infty }\gamma ^{t}R_{t+1}\mid S_{0}=s\right],}"></span></dd></dl> <p>where the random variable <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle G}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>G</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle G}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f5f3c8921a3b352de45446a6789b104458c9f90b" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.827ex; height:2.176ex;" alt="{\displaystyle G}"></span> denotes the <b>discounted return</b>, and is defined as the sum of future discounted rewards: </p> <dl><dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle G=\sum _{t=0}^{\infty }\gamma ^{t}R_{t+1}=R_{1}+\gamma R_{2}+\gamma ^{2}R_{3}+\dots ,}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>G</mi> <mo>=</mo> <munderover> <mo>∑<!-- ∑ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>=</mo> <mn>0</mn> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">∞<!-- ∞ --></mi> </mrow> </munderover> <msup> <mi>γ<!-- γ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msup> <msub> <mi>R</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>+</mo> <mn>1</mn> </mrow> </msub> <mo>=</mo> <msub> <mi>R</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo>+</mo> <mi>γ<!-- γ --></mi> <msub> <mi>R</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo>+</mo> <msup> <mi>γ<!-- γ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> <msub> <mi>R</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>3</mn> </mrow> </msub> <mo>+</mo> <mo>…<!-- … --></mo> <mo>,</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle G=\sum _{t=0}^{\infty }\gamma ^{t}R_{t+1}=R_{1}+\gamma R_{2}+\gamma ^{2}R_{3}+\dots ,}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/47c6a37cb9ed68a9560efd8521569e05d7378cba" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -3.005ex; width:42.892ex; height:6.843ex;" alt="{\displaystyle G=\sum _{t=0}^{\infty }\gamma ^{t}R_{t+1}=R_{1}+\gamma R_{2}+\gamma ^{2}R_{3}+\dots ,}"></span></dd></dl> <p>where <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle R_{t+1}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle 
displaystyle="true" scriptlevel="0"> <msub> <mi>R</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>+</mo> <mn>1</mn> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle R_{t+1}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/78bc3f077c19f7db4e89b5529a5861b06ae782b7" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:4.69ex; height:2.509ex;" alt="{\displaystyle R_{t+1}}"></span> is the reward for transitioning from state <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle S_{t}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>S</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle S_{t}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/7e2391e6e796fbf718be3828080775ac2ac3d3d4" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:2.251ex; height:2.509ex;" alt="{\displaystyle S_{t}}"></span> to <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle S_{t+1}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>S</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>+</mo> <mn>1</mn> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle S_{t+1}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/7fde0b9d29bcb7e8d3eee3a8f42ead14aa3b8cee" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:4.351ex; height:2.509ex;" alt="{\displaystyle S_{t+1}}"></span>, <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle 0\leq \gamma <1}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mn>0</mn> <mo>≤<!-- ≤ --></mo> <mi>γ<!-- γ --></mi> <mo><</mo> <mn>1</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle 0\leq \gamma <1}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/84796b874665109176d8773a2e6495f00c7cc360" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:9.784ex; height:2.676ex;" alt="{\displaystyle 0\leq \gamma <1}"></span> is the <a href="/wiki/Q-learning#Discount_factor" title="Q-learning">discount rate</a>. 
<span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \gamma }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>γ<!-- γ --></mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \gamma }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/a223c880b0ce3da8f64ee33c4f0010beee400b1a" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:1.262ex; height:2.176ex;" alt="{\displaystyle \gamma }"></span> is less than 1, so rewards in the distant future are weighted less than rewards in the immediate future. </p><p>The algorithm must find a policy with maximum expected discounted return. From the theory of Markov decision processes it is known that, without loss of generality, the search can be restricted to the set of so-called <i>stationary</i> policies. A policy is <i>stationary</i> if the action-distribution returned by it depends only on the last state visited (from the observation agent's history). The search can be further restricted to <i>deterministic</i> stationary policies. A <i>deterministic stationary</i> policy deterministically selects actions based on the current state. Since any such policy can be identified with a mapping from the set of states to the set of actions, these policies can be identified with such mappings with no loss of generality. </p> <div class="mw-heading mw-heading3"><h3 id="Brute_force">Brute force</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Reinforcement_learning&action=edit&section=7" title="Edit section: Brute force"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>The <a href="/wiki/Brute-force_search" title="Brute-force search">brute force</a> approach entails two steps: </p> <ul><li>For each possible policy, sample returns while following it</li> <li>Choose the policy with the largest expected discounted return</li></ul> <p>One problem with this is that the number of policies can be large, or even infinite. Another is that the variance of the returns may be large, which requires many samples to accurately estimate the discounted return of each policy. </p><p>These problems can be ameliorated if we assume some structure and allow samples generated from one policy to influence the estimates made for others. The two main approaches for achieving this are <a href="#Value_function">value function estimation</a> and <a href="#Direct_policy_search">direct policy search</a>. 
These problems can be ameliorated if we assume some structure and allow samples generated from one policy to influence the estimates made for others. The two main approaches for achieving this are value function estimation and direct policy search.

=== Value function ===

''See also: Value function''

Value function approaches attempt to find a policy that maximizes the discounted return by maintaining a set of estimates of expected discounted returns <math>\mathbb{E}[G]</math> for some policy (usually either the "current" [on-policy] or the optimal [off-policy] one).

These methods rely on the theory of Markov decision processes, where optimality is defined in a sense stronger than the one above: a policy is optimal if it achieves the best expected discounted return from ''any'' initial state (i.e., initial distributions play no role in this definition). Again, an optimal policy can always be found among stationary policies.
To define optimality in a formal manner, define the state-value of a policy <math>\pi</math> by

: <math>V^{\pi}(s) = \mathbb{E}[G \mid s, \pi],</math>

where <math>G</math> stands for the discounted return associated with following <math>\pi</math> from the initial state <math>s</math>. Defining <math>V^{*}(s)</math> as the maximum possible state-value of <math>V^{\pi}(s)</math>, where <math>\pi</math> is allowed to change,

: <math>V^{*}(s) = \max_{\pi} V^{\pi}(s).</math>

A policy that achieves these optimal state-values in each state is called ''optimal''. Clearly, a policy that is optimal in this sense is also optimal in the sense that it maximizes the expected discounted return, since <math>V^{*}(s) = \max_{\pi} \mathbb{E}[G \mid s, \pi]</math>, where <math>s</math> is a state randomly sampled from the distribution <math>\mu</math> of initial states (so <math>\mu(s) = \Pr(S_0 = s)</math>).

Although state-values suffice to define optimality, it is useful to define action-values. Given a state <math>s</math>, an action <math>a</math> and a policy <math>\pi</math>, the action-value of the pair <math>(s,a)</math> under <math>\pi</math> is defined by

: <math>Q^{\pi}(s,a) = \mathbb{E}[G \mid s, a, \pi],</math>

where <math>G</math> now stands for the random discounted return associated with first taking action <math>a</math> in state <math>s</math> and following <math>\pi</math> thereafter.
The theory of Markov decision processes states that if <math>\pi^{*}</math> is an optimal policy, we act optimally (take the optimal action) by choosing the action from <math>Q^{\pi^{*}}(s,\cdot)</math> with the highest action-value at each state <math>s</math>. The ''action-value function'' of such an optimal policy (<math>Q^{\pi^{*}}</math>) is called the ''optimal action-value function'' and is commonly denoted by <math>Q^{*}</math>. In summary, knowledge of the optimal action-value function alone suffices to know how to act optimally.
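A minimal sketch of acting greedily with respect to an action-value function: given (an estimate of) <math>Q^{*}(s,\cdot)</math>, the chosen action in each state is one with the highest action-value. The tabular dictionary <code>q</code> and the state/action names are assumed for the example.

<syntaxhighlight lang="python">
def greedy_action(q, state, actions):
    """Return an action maximizing q[(state, action)] over the available actions."""
    return max(actions, key=lambda a: q[(state, a)])

# Hypothetical action-value table for one state with two actions:
q = {("s0", "left"): 1.5, ("s0", "right"): 2.0}
print(greedy_action(q, "s0", ["left", "right"]))  # 'right'
</syntaxhighlight>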
Assuming full knowledge of the Markov decision process, the two basic approaches to compute the optimal action-value function are value iteration and policy iteration.
Both algorithms compute a sequence of functions <math>Q_k</math> (<math>k = 0, 1, 2, \ldots</math>) that converge to <math>Q^{*}</math>. Computing these functions involves computing expectations over the whole state space, which is impractical for all but the smallest (finite) Markov decision processes. In reinforcement learning methods, expectations are approximated by averaging over samples and using function approximation techniques to cope with the need to represent value functions over large state-action spaces.
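For a small, fully known MDP, value iteration on action-values can be sketched as follows; the transition model <code>P[s][a]</code> (assumed to be a list of <code>(probability, next_state, reward)</code> triples) and the convergence tolerance are illustrative assumptions, not the article's notation.

<syntaxhighlight lang="python">
def q_value_iteration(states, actions, P, gamma=0.9, tol=1e-6):
    """Iterate Q_{k+1}(s,a) = E[R + gamma * max_a' Q_k(s',a')] until convergence.

    P[s][a] is assumed to be a list of (probability, next_state, reward) triples;
    the loop sweeps the whole state-action space, which is what makes the exact
    computation impractical beyond small finite MDPs.
    """
    q = {(s, a): 0.0 for s in states for a in actions}
    while True:
        delta = 0.0
        for s in states:
            for a in actions:
                new = sum(p * (r + gamma * max(q[(s2, a2)] for a2 in actions))
                          for p, s2, r in P[s][a])
                delta = max(delta, abs(new - q[(s, a)]))
                q[(s, a)] = new
        if delta < tol:
            return q
</syntaxhighlight>

Sample-based reinforcement learning methods replace the exact expectation in the inner sum with averages over observed transitions, and replace the table <code>q</code> with a function approximator when the state-action space is large.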
Unlike methods that require full knowledge of the environment’s dynamics, Monte Carlo methods rely solely on actual or <a href="/wiki/Simulation" title="Simulation">simulated</a> experience—sequences of states, actions, and rewards obtained from interaction with an environment. This makes them applicable in situations where the complete dynamics are unknown. Learning from actual experience does not require prior knowledge of the environment and can still lead to optimal behavior. When using simulated experience, only a model capable of generating sample transitions is required, rather than a full specification of <a href="/wiki/Markov_chain" title="Markov chain">transition probabilities</a>, which is necessary for <a href="/wiki/Dynamic_programming" title="Dynamic programming">dynamic programming</a> methods. </p><p>Monte Carlo methods apply to episodic tasks, where experience is divided into episodes that eventually terminate. Policy and value function updates occur only after the completion of an episode, making these methods incremental on an episode-by-episode basis, though not on a step-by-step (online) basis. The term “Monte Carlo” generally refers to any method involving <a href="/wiki/Random_sampling" class="mw-redirect" title="Random sampling">random sampling</a>; however, in this context, it specifically refers to methods that compute averages from <i>complete</i> returns, rather than <i>partial</i> returns. </p><p>These methods function similarly to <a href="/wiki/Multi-armed_bandit" title="Multi-armed bandit">bandit algorithms</a>, in which returns are averaged for each state-action pair. The key difference is that actions taken in one state affect the returns of subsequent states within the same episode, making the problem <a href="/wiki/Non-stationary" class="mw-redirect" title="Non-stationary">non-stationary</a>. To address this non-stationarity, Monte Carlo methods use the framework of generalized policy iteration (GPI). While dynamic programming computes <a href="/wiki/Value_function" title="Value function">value functions</a> using full knowledge of the <a href="/wiki/Markov_decision_process" title="Markov decision process">Markov decision process</a> (MDP), Monte Carlo methods learn these functions through sample returns. The value functions and policies interact similarly to dynamic programming to achieve <a href="/wiki/Mathematical_optimization" title="Mathematical optimization">optimality</a>, first addressing the prediction problem and then extending to policy improvement and control, all based on sampled experience.<sup id="cite_ref-:0_14-2" class="reference"><a href="#cite_note-:0-14">[14]</a></sup></p>
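<p>A minimal first-visit Monte Carlo sketch for estimating action values from complete episode returns could look like this; the <code>sample_episode(policy)</code> function, assumed to return a terminated list of (state, action, reward) tuples, is a hypothetical stand-in for actual or simulated experience.</p>
<pre>
from collections import defaultdict

def mc_first_visit_q(sample_episode, policy, num_episodes=1000, gamma=0.99):
    # Average first-visit returns over complete episodes to estimate Q(s, a).
    returns_sum, returns_count, Q = defaultdict(float), defaultdict(int), {}
    for _ in range(num_episodes):
        episode = sample_episode(policy)            # [(state, action, reward), ...]
        # Return from each time step, computed backwards from the episode end.
        G, returns_from = 0.0, [0.0] * len(episode)
        for t in reversed(range(len(episode))):
            G = episode[t][2] + gamma * G
            returns_from[t] = G
        seen = set()
        for t, (s, a, _) in enumerate(episode):     # first visit of each (s, a) only
            if (s, a) not in seen:
                seen.add((s, a))
                returns_sum[(s, a)] += returns_from[t]
                returns_count[(s, a)] += 1
                Q[(s, a)] = returns_sum[(s, a)] / returns_count[(s, a)]
    return Q
</pre>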
<div class="mw-heading mw-heading4"><h4 id="Temporal_difference_methods">Temporal difference methods</h4></div> <div role="note" class="hatnote navigation-not-searchable">Main article: <a href="/wiki/Temporal_difference_learning" title="Temporal difference learning">Temporal difference learning</a></div> <p>The first problem is corrected by allowing the procedure to change the policy (at some or all states) before the values settle. This too may be problematic, as it might prevent convergence. Most current algorithms do this, giving rise to the class of <i>generalized policy iteration</i> algorithms. Many <i>actor-critic</i> methods belong to this category. </p><p>The second issue can be corrected by allowing trajectories to contribute to any state-action pair in them. This may also help to some extent with the third problem, although a better solution when returns have high variance is Sutton's <a href="/wiki/Temporal_difference" class="mw-redirect" title="Temporal difference">temporal difference</a> (TD) methods, which are based on the recursive <a href="/wiki/Bellman_equation" title="Bellman equation">Bellman equation</a>.<sup id="cite_ref-16" class="reference"><a href="#cite_note-16">[16]</a></sup><sup class="reference">[17]</sup> The computation in TD methods can be incremental (the memory is updated after each transition and the transition is then discarded) or batch (the transitions are collected and the estimates are computed once from the whole batch). Batch methods, such as the least-squares temporal difference method,<sup id="cite_ref-18" class="reference"><a href="#cite_note-18">[18]</a></sup> may use the information in the samples better, while incremental methods are the only choice when batch methods are infeasible due to their high computational or memory complexity. Some methods try to combine the two approaches. Methods based on temporal differences also overcome the fourth issue.</p>
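<p>A minimal sketch of an incremental TD(0) update for state values, in which each observed transition is used once and then discarded, might look as follows; the <code>env</code> and <code>policy</code> objects are hypothetical placeholders for the environment interface.</p>
<pre>
def td0_value_estimate(env, policy, num_episodes=500, alpha=0.1, gamma=0.99):
    V = {}                                   # state-value estimates, default 0
    for _ in range(num_episodes):
        s, done = env.reset(), False
        while not done:
            a = policy(s)
            s_next, r, done = env.step(a)    # assumed environment interface
            target = r + (0.0 if done else gamma * V.get(s_next, 0.0))
            # Incremental update derived from the recursive Bellman relation.
            V[s] = V.get(s, 0.0) + alpha * (target - V.get(s, 0.0))
            s = s_next
    return V
</pre>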
<p>Another problem specific to TD methods comes from their reliance on the recursive Bellman equation. Most TD methods have a so-called <i>λ</i> parameter (0 ≤ <i>λ</i> ≤ 1) that can continuously interpolate between Monte Carlo methods, which do not rely on the Bellman equations, and the basic TD methods, which rely entirely on the Bellman equations. This can be effective in mitigating the issue. </p> <div class="mw-heading mw-heading4"><h4 id="Function_approximation_methods">Function approximation methods</h4></div> <p>In order to address the fifth issue, <i>function approximation methods</i> are used. <i>Linear function approximation</i> starts with a mapping <i>φ</i> that assigns a finite-dimensional vector to each state-action pair.
Then, the action values of a state-action pair (<i>s</i>, <i>a</i>) are obtained by linearly combining the components of <i>φ</i>(<i>s</i>, <i>a</i>) with some <i>weights</i> <i>θ</i>: </p> <dl><dd><i>Q</i>(<i>s</i>, <i>a</i>) = ∑<sub><i>i</i> = 1</sub><sup><i>d</i></sup> <i>θ</i><sub><i>i</i></sub> <i>φ</i><sub><i>i</i></sub>(<i>s</i>, <i>a</i>).</dd></dl>
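<p>In code, this linear form is simply a dot product between a feature vector and the weight vector; a sketch with a hypothetical feature map <code>phi</code> could be:</p>
<pre>
import numpy as np

def q_linear(theta, phi, s, a):
    # Q(s, a) = sum_i theta_i * phi_i(s, a)
    return float(np.dot(theta, phi(s, a)))

def update_weights(theta, phi, s, a, target, alpha=0.05):
    # Semi-gradient step moving the weights towards a given target value.
    features = phi(s, a)
    td_error = target - np.dot(theta, features)
    return theta + alpha * td_error * features
</pre>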
src="https://wikimedia.org/api/rest_v1/media/math/render/svg/b8fb9c17e9850dfa123aac5cf0541b629df47de7" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -3.005ex; width:23.728ex; height:7.343ex;" alt="{\displaystyle Q(s,a)=\sum _{i=1}^{d}\theta _{i}\phi _{i}(s,a).}"></span></dd></dl> <p>The algorithms then adjust the weights, instead of adjusting the values associated with the individual state-action pairs. Methods based on ideas from <a href="/wiki/Nonparametric_statistics" title="Nonparametric statistics">nonparametric statistics</a> (which can be seen to construct their own features) have been explored. </p><p>Value iteration can also be used as a starting point, giving rise to the <a href="/wiki/Q-learning" title="Q-learning">Q-learning</a> algorithm and its many variants.<sup id="cite_ref-19" class="reference"><a href="#cite_note-19"><span class="cite-bracket">[</span>19<span class="cite-bracket">]</span></a></sup> Including Deep Q-learning methods when a neural network is used to represent Q, with various applications in stochastic search problems.<sup id="cite_ref-MBK_20-0" class="reference"><a href="#cite_note-MBK-20"><span class="cite-bracket">[</span>20<span class="cite-bracket">]</span></a></sup> </p><p>The problem with using action-values is that they may need highly precise estimates of the competing action values that can be hard to obtain when the returns are noisy, though this problem is mitigated to some extent by temporal difference methods. Using the so-called compatible function approximation method compromises generality and efficiency. </p> <div class="mw-heading mw-heading3"><h3 id="Direct_policy_search">Direct policy search</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Reinforcement_learning&action=edit&section=12" title="Edit section: Direct policy search"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>An alternative method is to search directly in (some subset of) the policy space, in which case the problem becomes a case of <a href="/wiki/Stochastic_optimization" title="Stochastic optimization">stochastic optimization</a>. The two approaches available are gradient-based and gradient-free methods. 
</p><p><a href="/wiki/Gradient" title="Gradient">Gradient</a>-based methods (<i>policy gradient methods</i>) start with a mapping from a finite-dimensional (parameter) space to the space of policies: given the parameter vector <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \theta }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>θ<!-- θ --></mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \theta }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/6e5ab2664b422d53eb0c7df3b87e1360d75ad9af" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.09ex; height:2.176ex;" alt="{\displaystyle \theta }"></span>, let <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \pi _{\theta }}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>π<!-- π --></mi> <mrow class="MJX-TeXAtom-ORD"> <mi>θ<!-- θ --></mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \pi _{\theta }}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/2a9f6266b9302e47ccd37e867f4a364ff52fb8af" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:2.328ex; height:2.009ex;" alt="{\displaystyle \pi _{\theta }}"></span> denote the policy associated to <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \theta }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>θ<!-- θ --></mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \theta }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/6e5ab2664b422d53eb0c7df3b87e1360d75ad9af" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.09ex; height:2.176ex;" alt="{\displaystyle \theta }"></span>. 
Defining the performance function by <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \rho (\theta )=\rho ^{\pi _{\theta }}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>ρ<!-- ρ --></mi> <mo stretchy="false">(</mo> <mi>θ<!-- θ --></mi> <mo stretchy="false">)</mo> <mo>=</mo> <msup> <mi>ρ<!-- ρ --></mi> <mrow class="MJX-TeXAtom-ORD"> <msub> <mi>π<!-- π --></mi> <mrow class="MJX-TeXAtom-ORD"> <mi>θ<!-- θ --></mi> </mrow> </msub> </mrow> </msup> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \rho (\theta )=\rho ^{\pi _{\theta }}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/a66bf6220d37e57309930a00168cf36d105393b9" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:10.362ex; height:2.843ex;" alt="{\displaystyle \rho (\theta )=\rho ^{\pi _{\theta }}}"></span> under mild conditions this function will be differentiable as a function of the parameter vector <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \theta }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>θ<!-- θ --></mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \theta }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/6e5ab2664b422d53eb0c7df3b87e1360d75ad9af" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.09ex; height:2.176ex;" alt="{\displaystyle \theta }"></span>. If the gradient of <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \rho }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>ρ<!-- ρ --></mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \rho }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/1f7d439671d1289b6a816e6af7a304be40608d64" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:1.202ex; height:2.176ex;" alt="{\displaystyle \rho }"></span> was known, one could use <a href="/wiki/Gradient_descent" title="Gradient descent">gradient ascent</a>. Since an analytic expression for the gradient is not available, only a noisy estimate is available. Such an estimate can be constructed in many ways, giving rise to algorithms such as Williams' REINFORCE method<sup id="cite_ref-21" class="reference"><a href="#cite_note-21"><span class="cite-bracket">[</span>21<span class="cite-bracket">]</span></a></sup> (which is known as the likelihood ratio method in the <a href="/wiki/Simulation-based_optimization" title="Simulation-based optimization">simulation-based optimization</a> literature).<sup id="cite_ref-22" class="reference"><a href="#cite_note-22"><span class="cite-bracket">[</span>22<span class="cite-bracket">]</span></a></sup> </p><p>A large class of methods avoids relying on gradient information. 
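<p>A sketch of a REINFORCE-style estimate, which weights log-probability gradients of the sampled actions by the corresponding returns, might look as follows; the <code>sample_trajectory</code> and <code>grad_log_pi</code> functions are hypothetical placeholders for the parameterized policy.</p>
<pre>
import numpy as np

def reinforce_gradient(theta, sample_trajectory, grad_log_pi, gamma=0.99):
    # One noisy estimate of the gradient of the performance rho(theta).
    states, actions, rewards = sample_trajectory(theta)   # one episode under pi_theta
    # Return from each time step, accumulated backwards.
    G, returns = 0.0, [0.0] * len(rewards)
    for t in reversed(range(len(rewards))):
        G = rewards[t] + gamma * G
        returns[t] = G
    grad = np.zeros_like(theta)
    for s, a, G_t in zip(states, actions, returns):
        grad += grad_log_pi(theta, s, a) * G_t             # grad log pi(a|s) * return
    return grad   # used for (stochastic) gradient ascent on theta
</pre>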
<p>A large class of methods avoids relying on gradient information. These include <a href="/wiki/Simulated_annealing" title="Simulated annealing">simulated annealing</a>, <a href="/wiki/Cross-entropy_method" title="Cross-entropy method">cross-entropy search</a> and methods of <a href="/wiki/Evolutionary_computation" title="Evolutionary computation">evolutionary computation</a>. Many gradient-free methods can achieve (in theory and in the limit) a global optimum. </p><p>Policy search methods may converge slowly given noisy data. For example, this happens in episodic problems when the trajectories are long and the variance of the returns is large. Value-function based methods that rely on temporal differences might help in this case. In recent years, <i>actor–critic methods</i> have been proposed and have performed well on various problems.<sup id="cite_ref-23" class="reference"><a href="#cite_note-23">[23]</a></sup> </p><p>Policy search methods have been used in the <a href="/wiki/Robotics" title="Robotics">robotics</a> context.<sup id="cite_ref-24" class="reference"><a href="#cite_note-24">[24]</a></sup> Many policy search methods may get stuck in local optima (as they are based on <a href="/wiki/Local_search_(optimization)" title="Local search (optimization)">local search</a>). </p> <div class="mw-heading mw-heading3"><h3 id="Model-based_algorithms">Model-based algorithms</h3></div> <p>Finally, all of the above methods can be combined with algorithms that first learn a model of the <a href="/wiki/Markov_decision_process" title="Markov decision process">Markov decision process</a>, that is, the probability of each next state given an action taken from an existing state. For instance, the Dyna algorithm<sup id="cite_ref-25" class="reference"><a href="#cite_note-25">[25]</a></sup> learns a model from experience and uses it to provide additional modelled transitions for a value function, alongside the real transitions. Such methods can sometimes be extended to the use of non-parametric models, such as when the transitions are simply stored and 'replayed'<sup id="cite_ref-26" class="reference"><a href="#cite_note-26">[26]</a></sup> to the learning algorithm. </p><p>Model-based methods can be more computationally intensive than model-free approaches, and their utility can be limited by the extent to which the Markov decision process can be learnt.<sup id="cite_ref-27" class="reference"><a href="#cite_note-27">[27]</a></sup> </p><p>There are other ways to use models than to update a value function.<sup id="cite_ref-28" class="reference"><a href="#cite_note-28">[28]</a></sup> For instance, in <a href="/wiki/Model_predictive_control" title="Model predictive control">model predictive control</a> the model is used to update the behavior directly.</p>
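<p>A sketch of the Dyna idea, interleaving updates from real transitions with updates from transitions replayed out of a learned (here simply memorised) model, might look like this; the <code>env</code> interface and the tabular update are hypothetical simplifications.</p>
<pre>
import random
from collections import defaultdict

def dyna_q(env, num_steps=5000, planning_steps=10, alpha=0.1, gamma=0.99, epsilon=0.1):
    Q = defaultdict(float)                    # tabular action values
    model = {}                                # learned model: (s, a) -> (r, s_next, done)
    actions = list(range(env.n_actions))

    def update(s, a, r, s_next, done):
        best = 0.0 if done else max(Q[(s_next, x)] for x in actions)
        Q[(s, a)] += alpha * (r + gamma * best - Q[(s, a)])

    s = env.reset()
    for _ in range(num_steps):
        a = random.choice(actions) if random.random() < epsilon \
            else max(actions, key=lambda x: Q[(s, x)])
        s_next, r, done = env.step(a)         # real transition
        update(s, a, r, s_next, done)
        model[(s, a)] = (r, s_next, done)     # remember it as a simple model
        for (ms, ma) in random.sample(list(model), min(planning_steps, len(model))):
            update(ms, ma, *model[(ms, ma)])  # additional modelled transitions
        s = env.reset() if done else s_next
    return Q
</pre>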
</p> <div class="mw-heading mw-heading2"><h2 id="Theory">Theory</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Reinforcement_learning&action=edit&section=14" title="Edit section: Theory"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Both the asymptotic and finite-sample behaviors of most algorithms are well understood. Algorithms with provably good online performance (addressing the exploration issue) are known. </p><p>Efficient exploration of Markov decision processes is given in Burnetas and Katehakis (1997).<sup id="cite_ref-Optimal_adaptive_policies_for_Marko_12-1" class="reference"><a href="#cite_note-Optimal_adaptive_policies_for_Marko-12"><span class="cite-bracket">[</span>12<span class="cite-bracket">]</span></a></sup> Finite-time performance bounds have also appeared for many algorithms, but these bounds are expected to be rather loose and thus more work is needed to better understand the relative advantages and limitations. </p><p>For incremental algorithms, asymptotic convergence issues have been settled<sup class="noprint Inline-Template" style="margin-left:0.1em; white-space:nowrap;">[<i><a href="/wiki/Wikipedia:Please_clarify" title="Wikipedia:Please clarify"><span title="What are the issues that have been settled? (January 2020)">clarification needed</span></a></i>]</sup>. Temporal-difference-based algorithms converge under a wider set of conditions than was previously possible (for example, when used with arbitrary, smooth function approximation). </p> <div class="mw-heading mw-heading2"><h2 id="Research">Research</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Reinforcement_learning&action=edit&section=15" title="Edit section: Research"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <style data-mw-deduplicate="TemplateStyles:r1251242444">.mw-parser-output .ambox{border:1px solid #a2a9b1;border-left:10px solid #36c;background-color:#fbfbfb;box-sizing:border-box}.mw-parser-output .ambox+link+.ambox,.mw-parser-output .ambox+link+style+.ambox,.mw-parser-output .ambox+link+link+.ambox,.mw-parser-output .ambox+.mw-empty-elt+link+.ambox,.mw-parser-output .ambox+.mw-empty-elt+link+style+.ambox,.mw-parser-output .ambox+.mw-empty-elt+link+link+.ambox{margin-top:-1px}html body.mediawiki .mw-parser-output .ambox.mbox-small-left{margin:4px 1em 4px 0;overflow:hidden;width:238px;border-collapse:collapse;font-size:88%;line-height:1.25em}.mw-parser-output .ambox-speedy{border-left:10px solid #b32424;background-color:#fee7e6}.mw-parser-output .ambox-delete{border-left:10px solid #b32424}.mw-parser-output .ambox-content{border-left:10px solid #f28500}.mw-parser-output .ambox-style{border-left:10px solid #fc3}.mw-parser-output .ambox-move{border-left:10px solid #9932cc}.mw-parser-output .ambox-protection{border-left:10px solid #a2a9b1}.mw-parser-output .ambox .mbox-text{border:none;padding:0.25em 0.5em;width:100%}.mw-parser-output .ambox .mbox-image{border:none;padding:2px 0 2px 0.5em;text-align:center}.mw-parser-output .ambox .mbox-imageright{border:none;padding:2px 0.5em 2px 0;text-align:center}.mw-parser-output .ambox .mbox-empty-cell{border:none;padding:0;width:1px}.mw-parser-output .ambox .mbox-image-div{width:52px}@media(min-width:720px){.mw-parser-output .ambox{margin:0 10%}}@media print{body.ns-0 .mw-parser-output .ambox{display:none!important}}</style><table class="box-More_citations_needed_section 
<p>Research topics include: </p> <ul><li>actor-critic architecture<sup id="cite_ref-29" class="reference"><a href="#cite_note-29">[29]</a></sup></li> <li>actor-critic-scenery architecture<sup class="reference"><a href="#cite_note-Li-2023-3">[3]</a></sup></li> <li>adaptive methods that work with fewer (or no) parameters under a large number of conditions</li> <li>bug detection in software projects<sup id="cite_ref-30" class="reference"><a href="#cite_note-30">[30]</a></sup></li> <li>continuous learning</li> <li>combinations with logic-based frameworks<sup id="cite_ref-31" class="reference"><a href="#cite_note-31">[31]</a></sup></li> <li>exploration in large Markov decision processes</li> <li><a href="/wiki/Reinforcement_learning_from_human_feedback" title="Reinforcement learning from human feedback">human feedback</a><sup id="cite_ref-32" class="reference"><a href="#cite_note-32">[32]</a></sup></li> <li>interaction between implicit and explicit learning in skill acquisition</li> <li><a href="/wiki/Intrinsic_motivation_(artificial_intelligence)" title="Intrinsic motivation (artificial intelligence)">intrinsic motivation</a>, which differentiates information-seeking, curiosity-type behaviours from task-dependent, goal-directed behaviours</li> <li>large-scale empirical evaluations</li> <li>large (or continuous) action spaces</li> <li>modular and hierarchical reinforcement learning<sup id="cite_ref-33" class="reference"><a href="#cite_note-33">[33]</a></sup></li>
<li>multiagent/distributed reinforcement learning, a topic of interest with expanding applications<sup id="cite_ref-34" class="reference"><a href="#cite_note-34">[34]</a></sup></li> <li>occupant-centric control</li> <li>optimization of computing resources<sup id="cite_ref-35" class="reference"><a href="#cite_note-35">[35]</a></sup><sup id="cite_ref-36" class="reference"><a href="#cite_note-36">[36]</a></sup><sup id="cite_ref-37" class="reference"><a href="#cite_note-37">[37]</a></sup></li> <li><a href="/wiki/Partially_observable_Markov_decision_process" title="Partially observable Markov decision process">partial information</a> (e.g., using <a href="/wiki/Predictive_state_representation" title="Predictive state representation">predictive state representation</a>)</li> <li>reward functions based on maximising novel information<sup class="reference"><a href="#cite_note-kaplan2004-38">[38]</a></sup><sup class="reference"><a href="#cite_note-klyubin2008-39">[39]</a></sup><sup class="reference"><a href="#cite_note-barto2013-40">[40]</a></sup></li> <li>sample-based planning (e.g., based on <a href="/wiki/Monte_Carlo_tree_search" title="Monte Carlo tree search">Monte Carlo tree search</a>)</li> <li>securities trading<sup id="cite_ref-41" class="reference"><a href="#cite_note-41">[41]</a></sup></li> <li><a href="/wiki/Transfer_learning" title="Transfer learning">transfer learning</a><sup id="cite_ref-42" class="reference"><a href="#cite_note-42">[42]</a></sup></li> <li>TD learning as a model of <a href="/wiki/Dopamine" title="Dopamine">dopamine</a>-based learning in the brain.
<a href="/wiki/Dopaminergic" title="Dopaminergic">Dopaminergic</a> projections from the <a href="/wiki/Substantia_nigra" title="Substantia nigra">substantia nigra</a> to the <a href="/wiki/Basal_ganglia" title="Basal ganglia">basal ganglia</a> function are the prediction error.</li> <li>value-function and policy search methods</li></ul> <div class="mw-heading mw-heading2"><h2 id="Comparison_of_key_algorithms">Comparison of key algorithms</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Reinforcement_learning&action=edit&section=16" title="Edit section: Comparison of key algorithms"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <table class="wikitable sortable"> <tbody><tr> <th>Algorithm</th> <th>Description</th> <th>Policy</th> <th>Action space</th> <th>State space</th> <th>Operator </th></tr> <tr> <td><a href="/wiki/Monte_Carlo_method" title="Monte Carlo method">Monte Carlo</a></td> <td>Every visit to Monte Carlo</td> <td>Either</td> <td>Discrete</td> <td>Discrete</td> <td>Sample-means of state-values or action-values </td></tr> <tr> <td><a href="/wiki/Temporal_difference_learning" title="Temporal difference learning">TD learning</a></td> <td>State–action–reward–state</td> <td>Off-policy</td> <td>Discrete</td> <td>Discrete</td> <td>State-value </td></tr> <tr> <td><a href="/wiki/Q-learning" title="Q-learning">Q-learning</a></td> <td>State–action–reward–state</td> <td>Off-policy</td> <td>Discrete</td> <td>Discrete</td> <td>Action-value </td></tr> <tr> <td><a href="/wiki/State%E2%80%93action%E2%80%93reward%E2%80%93state%E2%80%93action" title="State–action–reward–state–action">SARSA</a></td> <td>State–action–reward–state–action</td> <td>On-policy</td> <td>Discrete</td> <td>Discrete</td> <td>Action-value </td></tr> <tr> <td><a href="/wiki/Q-learning#Deep_Q-learning" title="Q-learning">DQN</a></td> <td>Deep Q Network</td> <td>Off-policy</td> <td>Discrete</td> <td>Continuous</td> <td>Action-value </td></tr> <tr> <td>DDPG</td> <td>Deep Deterministic Policy Gradient</td> <td>Off-policy</td> <td>Continuous</td> <td>Continuous</td> <td>Action-value </td></tr> <tr> <td>A3C</td> <td>Asynchronous Advantage Actor-Critic Algorithm</td> <td>On-policy</td> <td>Discrete</td> <td>Continuous</td> <td>Advantage (=action-value - state-value) </td></tr> <tr> <td>TRPO</td> <td>Trust Region Policy Optimization</td> <td>On-policy</td> <td>Continuous or Discrete</td> <td>Continuous</td> <td>Advantage </td></tr> <tr> <td><a href="/wiki/Proximal_Policy_Optimization" class="mw-redirect" title="Proximal Policy Optimization">PPO</a></td> <td>Proximal Policy Optimization</td> <td>On-policy</td> <td>Continuous or Discrete</td> <td>Continuous</td> <td>Advantage </td></tr> <tr> <td>TD3 </td> <td>Twin Delayed Deep Deterministic Policy Gradient </td> <td>Off-policy </td> <td>Continuous </td> <td>Continuous </td> <td>Action-value </td></tr> <tr> <td>SAC </td> <td>Soft Actor-Critic </td> <td>Off-policy </td> <td>Continuous </td> <td>Continuous </td> <td>Advantage </td></tr> <tr> <td><a href="/wiki/Distributional_Soft_Actor_Critic" title="Distributional Soft Actor Critic">DSAC</a><sup id="cite_ref-43" class="reference"><a href="#cite_note-43"><span class="cite-bracket">[</span>43<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-44" class="reference"><a href="#cite_note-44"><span class="cite-bracket">[</span>44<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-45" class="reference"><a href="#cite_note-45"><span 
class="cite-bracket">[</span>45<span class="cite-bracket">]</span></a></sup></td> <td>Distributional Soft Actor Critic</td> <td>Off-policy</td> <td>Continuous</td> <td>Continuous</td> <td>Action-value distribution </td></tr></tbody></table> <div class="mw-heading mw-heading3"><h3 id="Associative_reinforcement_learning">Associative reinforcement learning</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Reinforcement_learning&action=edit&section=17" title="Edit section: Associative reinforcement learning"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Associative reinforcement learning tasks combine facets of stochastic learning automata tasks and supervised learning pattern classification tasks. In associative reinforcement learning tasks, the learning system interacts in a closed loop with its environment.<sup id="cite_ref-46" class="reference"><a href="#cite_note-46"><span class="cite-bracket">[</span>46<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Deep_reinforcement_learning">Deep reinforcement learning</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Reinforcement_learning&action=edit&section=18" title="Edit section: Deep reinforcement learning"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>This approach extends reinforcement learning by using a deep neural network and without explicitly designing the state space.<sup id="cite_ref-intro_deep_RL_47-0" class="reference"><a href="#cite_note-intro_deep_RL-47"><span class="cite-bracket">[</span>47<span class="cite-bracket">]</span></a></sup> The work on learning ATARI games by Google <a href="/wiki/DeepMind" class="mw-redirect" title="DeepMind">DeepMind</a> increased attention to <a href="/wiki/Deep_reinforcement_learning" title="Deep reinforcement learning">deep reinforcement learning</a> or <a href="/wiki/End-to-end_reinforcement_learning" class="mw-redirect" title="End-to-end reinforcement learning">end-to-end reinforcement learning</a>.<sup id="cite_ref-DQN2_48-0" class="reference"><a href="#cite_note-DQN2-48"><span class="cite-bracket">[</span>48<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Adversarial_deep_reinforcement_learning">Adversarial deep reinforcement learning</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Reinforcement_learning&action=edit&section=19" title="Edit section: Adversarial deep reinforcement learning"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Adversarial deep reinforcement learning is an active area of research in reinforcement learning focusing on vulnerabilities of learned policies. 
Early studies in this area showed that reinforcement learning policies are susceptible to imperceptible adversarial manipulations.<sup id="cite_ref-49" class="reference"><a href="#cite_note-49">[49]</a></sup><sup id="cite_ref-50" class="reference"><a href="#cite_note-50">[50]</a></sup><sup id="cite_ref-51" class="reference"><a href="#cite_note-51">[51]</a></sup> While some methods have been proposed to overcome these susceptibilities, more recent studies have shown that these proposed solutions are far from providing an accurate representation of the current vulnerabilities of deep reinforcement learning policies.<sup id="cite_ref-52" class="reference"><a href="#cite_note-52">[52]</a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Fuzzy_reinforcement_learning">Fuzzy reinforcement learning</h3></div> <p>By introducing <a href="/wiki/Fuzzy_control_system" title="Fuzzy control system">fuzzy inference</a> in reinforcement learning,<sup id="cite_ref-53" class="reference"><a href="#cite_note-53">[53]</a></sup> approximating the state-action value function with <a href="/wiki/Fuzzy_rule" title="Fuzzy rule">fuzzy rules</a> in continuous space becomes possible. The IF-THEN form of fuzzy rules makes this approach suitable for expressing the results in a form close to natural language. Extending FRL with Fuzzy Rule Interpolation<sup id="cite_ref-54" class="reference"><a href="#cite_note-54">[54]</a></sup> allows the use of reduced-size sparse fuzzy rule-bases to emphasize cardinal rules (the most important state-action values). </p> <div class="mw-heading mw-heading3"><h3 id="Inverse_reinforcement_learning">Inverse reinforcement learning</h3></div> <p>In inverse reinforcement learning (IRL), no reward function is given. Instead, the reward function is inferred from observed behavior of an expert. The idea is to mimic the observed behavior, which is often optimal or close to optimal.<sup id="cite_ref-55" class="reference"><a href="#cite_note-55">[55]</a></sup> One popular IRL paradigm is maximum entropy inverse reinforcement learning (MaxEnt IRL).<sup id="cite_ref-56" class="reference"><a href="#cite_note-56">[56]</a></sup> MaxEnt IRL estimates the parameters of a linear model of the reward function by maximizing the entropy of the probability distribution of observed trajectories, subject to constraints related to matching expected feature counts.
Recently it has been shown that MaxEnt IRL is a particular case of a more general framework named random utility inverse reinforcement learning (RU-IRL).<sup id="cite_ref-57" class="reference"><a href="#cite_note-57">[57]</a></sup> RU-IRL is based on <a href="/wiki/Random_utility_model" title="Random utility model">random utility theory</a> and Markov decision processes. While prior IRL approaches assume that the apparent random behavior of an observed agent is due to it following a random policy, RU-IRL assumes that the observed agent follows a deterministic policy, and that the randomness in observed behavior arises because the observer has only partial access to the features the observed agent uses in decision making. The utility function is modeled as a random variable to account for the observer's ignorance of the features the observed agent actually considers in its utility function. </p> <div class="mw-heading mw-heading3"><h3 id="Safe_reinforcement_learning">Safe reinforcement learning</h3></div> <p>Safe reinforcement learning (SRL) can be defined as the process of learning policies that maximize the expectation of the return in problems in which it is important to ensure reasonable system performance and/or respect safety constraints during the learning and/or deployment processes.<sup id="cite_ref-58" class="reference"><a href="#cite_note-58">[58]</a></sup> An alternative approach is risk-averse reinforcement learning, where instead of the <i>expected</i> return, a <i>risk measure</i> of the return is optimized, such as the <a href="/wiki/Expected_shortfall" title="Expected shortfall">Conditional Value at Risk</a> (CVaR).<sup id="cite_ref-59" class="reference"><a href="#cite_note-59">[59]</a></sup> In addition to mitigating risk, the CVaR objective increases robustness to model uncertainties.<sup id="cite_ref-60" class="reference"><a href="#cite_note-60">[60]</a></sup><sup id="cite_ref-61" class="reference"><a href="#cite_note-61">[61]</a></sup> However, CVaR optimization in risk-averse RL requires special care to prevent gradient bias<sup id="cite_ref-62" class="reference"><a href="#cite_note-62">[62]</a></sup> and blindness to success.<sup id="cite_ref-63" class="reference"><a href="#cite_note-63">[63]</a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Self-reinforcement_learning">Self-reinforcement learning</h3></div> <p>Self-reinforcement learning (or self-learning) is a learning paradigm which does not use the concept of an immediate reward <i>R</i><sub><i>a</i></sub>(<i>s</i>, <i>s'</i>) after
transition from <i>s</i> to <i>s'</i> with action <i>a</i>. It does not use an external reinforcement; it only uses the agent's internal self-reinforcement. The internal self-reinforcement is provided by a mechanism of feelings and emotions. In the learning process, emotions are backpropagated by a mechanism of secondary reinforcement. The learning equation does not include the immediate reward; it only includes the state evaluation. </p><p>The self-reinforcement algorithm updates a memory matrix <i>W</i> = ||<i>w</i>(<i>a</i>, <i>s</i>)|| such that in each iteration it executes the following routine: </p> <ol><li>In situation <i>s</i> perform action <i>a</i>.</li> <li>Receive a consequence situation <i>s'</i>.</li> <li>Compute a state evaluation <i>v</i>(<i>s'</i>) of how good it is to be in the consequence situation <i>s'</i>.</li> <li>Update the crossbar memory: <i>w'</i>(<i>a</i>, <i>s</i>) = <i>w</i>(<i>a</i>, <i>s</i>) + <i>v</i>(<i>s'</i>).</li></ol> <p>Initial conditions of the memory are received as input from the genetic environment. It is a system with only one input (situation) and only one output (action, or behavior). </p><p>Self-reinforcement (self-learning) was introduced in 1982, along with a neural network capable of self-reinforcement learning, named Crossbar Adaptive Array (CAA).<sup id="cite_ref-64" class="reference"><a href="#cite_note-64">[64]</a></sup><sup id="cite_ref-65" class="reference"><a href="#cite_note-65">[65]</a></sup> The CAA computes, in a crossbar fashion, both decisions about actions and emotions (feelings) about consequence states. The system is driven by the interaction between cognition and emotion.<sup id="cite_ref-66" class="reference"><a href="#cite_note-66">[66]</a></sup></p>
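<p>A toy sketch of the crossbar-style routine described above might look as follows; the state-evaluation function <code>v</code>, the action-selection rule and the environment step are hypothetical placeholders.</p>
<pre>
def crossbar_step(W, v, s, choose_action, env_step):
    # 1. in situation s perform action a
    a = choose_action(W, s)
    # 2. receive a consequence situation s'
    s_next = env_step(s, a)
    # 3. compute the state evaluation v(s') of the consequence situation
    # 4. update the crossbar memory w(a, s)
    W[a][s] += v(s_next)
    return s_next
</pre>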
<div class="mw-heading mw-heading2"><h2 id="Statistical_comparison_of_reinforcement_learning_algorithms">Statistical comparison of reinforcement learning algorithms</h2></div> <p>Efficient comparison of RL algorithms is essential for research, deployment and monitoring of RL systems. To compare different algorithms on a given environment, an agent can be trained for each algorithm. Since the performance is sensitive to implementation details, all algorithms should be implemented as closely as possible to each other.<sup id="cite_ref-67" class="reference"><a href="#cite_note-67">[67]</a></sup> After the training is finished, the agents can be run on a sample of test episodes, and their scores (returns) can be compared. Since episodes are typically assumed to be <a href="/wiki/I.i.d" class="mw-redirect" title="I.i.d">i.i.d</a>, standard statistical tools can be used for hypothesis testing, such as the <a href="/wiki/Student%27s_t-test" title="Student's t-test">T-test</a> and the <a href="/wiki/Permutation_test" title="Permutation test">permutation test</a>.<sup id="cite_ref-68" class="reference"><a href="#cite_note-68">[68]</a></sup> This requires accumulating all the rewards within an episode into a single number: the episodic return. However, this causes a loss of information, as different time-steps are averaged together, possibly with different levels of noise. Whenever the noise level varies across the episode, the statistical power can be improved significantly by weighting the rewards according to their estimated noise.<sup id="cite_ref-69" class="reference"><a href="#cite_note-69">[69]</a></sup></p>
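<p>As an illustration, the episodic returns of two trained agents can be compared with a standard two-sample test; the return arrays below are hypothetical.</p>
<pre>
import numpy as np
from scipy import stats

returns_a = np.array([12.1, 10.4, 11.8, 13.0, 9.7])   # hypothetical test-episode returns
returns_b = np.array([11.0,  9.5, 10.2, 10.9, 10.1])

# Two-sample t-test on the episodic returns (episodes assumed i.i.d.).
t_stat, p_value = stats.ttest_ind(returns_a, returns_b, equal_var=False)
print(f"t = {t_stat:.2f}, p = {p_value:.3f}")
</pre>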
<div class="mw-heading mw-heading2"><h2 id="See_also">See also</h2></div> <div class="div-col" style="column-width: 20em;"> <ul><li><a href="/wiki/Temporal_difference_learning" title="Temporal difference learning">Temporal difference learning</a></li> <li><a href="/wiki/Q-learning" title="Q-learning">Q-learning</a></li> <li><a href="/wiki/State%E2%80%93action%E2%80%93reward%E2%80%93state%E2%80%93action" title="State–action–reward–state–action">State–action–reward–state–action</a> (SARSA)</li> <li><a href="/wiki/Reinforcement_learning_from_human_feedback" title="Reinforcement learning from human feedback">Reinforcement learning from human feedback</a></li> <li><a href="/wiki/Optimal_control" title="Optimal control">Optimal control</a></li> <li><a href="/wiki/Error-driven_learning" title="Error-driven learning">Error-driven learning</a></li> <li><a href="/wiki/Multi-agent_reinforcement_learning" title="Multi-agent reinforcement learning">Multi-agent reinforcement learning</a></li> <li><a href="/wiki/Apprenticeship_learning" title="Apprenticeship learning">Apprenticeship learning</a></li> <li><a href="/wiki/Model-free_(reinforcement_learning)" title="Model-free (reinforcement learning)">Model-free (reinforcement learning)</a></li> <li><a href="/wiki/Active_learning_(machine_learning)" title="Active learning (machine learning)">Active learning (machine learning)</a></li></ul> </div> <div class="mw-heading mw-heading2"><h2 id="References">References</h2></div> <div class="reflist"> <ol class="references"> <li id="cite_note-kaelbling-1"><b><a href="#cite_ref-kaelbling_1-0">^</a></b> <cite class="citation journal cs1"><a href="/wiki/Leslie_P._Kaelbling" title="Leslie P. Kaelbling">Kaelbling, Leslie P.</a>; <a href="/wiki/Michael_L._Littman" title="Michael L. Littman">Littman, Michael L.</a>; Moore, Andrew W. (1996). <a rel="nofollow" class="external text" href="http://webarchive.loc.gov/all/20011120234539/http://www.cs.washington.edu/research/jair/abstracts/kaelbling96a.html">"Reinforcement Learning: A Survey"</a>. <i>Journal of Artificial Intelligence Research</i>. <b>4</b>: 237–285. arXiv:<a rel="nofollow" class="external text" href="https://arxiv.org/abs/cs/9605103">cs/9605103</a>. doi:<a rel="nofollow" class="external text" href="https://doi.org/10.1613%2Fjair.301">10.1613/jair.301</a>. S2CID <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:1708582">1708582</a>. Archived from <a rel="nofollow" class="external text" href="http://www.cs.washington.edu/research/jair/abstracts/kaelbling96a.html">the original</a> on 2001-11-20.</cite></li> <li id="cite_note-2"><b><a href="#cite_ref-2">^</a></b> <cite class="citation book cs1">van Otterlo, M.; Wiering, M. (2012). "Reinforcement Learning and Markov Decision Processes". <i>Reinforcement Learning</i>. Adaptation, Learning, and Optimization. Vol. 12. pp. 3–42. doi:<a rel="nofollow" class="external text" href="https://doi.org/10.1007%2F978-3-642-27645-3_1">10.1007/978-3-642-27645-3_1</a>. ISBN <a href="/wiki/Special:BookSources/978-3-642-27644-6" title="Special:BookSources/978-3-642-27644-6">978-3-642-27644-6</a>.</cite></li> <li id="cite_note-Li-2023-3">^ <a href="#cite_ref-Li-2023_3-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Li-2023_3-1"><sup><i><b>b</b></i></sup></a> <cite class="citation book cs1">Li, Shengbo (2023). <a rel="nofollow" class="external text" href="https://link.springer.com/book/10.1007/978-981-19-7784-8"><i>Reinforcement Learning for Sequential Decision and Optimal Control</i></a> (First ed.). Springer Verlag, Singapore. pp. 1–460. doi:<a rel="nofollow" class="external text" href="https://doi.org/10.1007%2F978-981-19-7784-8">10.1007/978-981-19-7784-8</a>. ISBN <a href="/wiki/Special:BookSources/978-9-811-97783-1" title="Special:BookSources/978-9-811-97783-1">978-9-811-97783-1</a>. S2CID <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:257928563">257928563</a>.</cite></li> <li id="cite_note-4"><b><a href="#cite_ref-4">^</a></b> <cite class="citation book cs1">Russell, Stuart J.; Norvig, Peter (2010). <i>Artificial intelligence: a modern approach</i> (Third ed.). Upper Saddle River, New Jersey. pp. 830, 831. ISBN <a href="/wiki/Special:BookSources/978-0-13-604259-4" title="Special:BookSources/978-0-13-604259-4">978-0-13-604259-4</a>.</cite></li> <li id="cite_note-5"><b><a href="#cite_ref-5">^</a></b> <cite class="citation journal cs1">Lee, Daeyeol; Seo, Hyojung; Jung, Min Whan (21 July 2012). <a rel="nofollow" class="external text" href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3490621">"Neural Basis of Reinforcement Learning and Decision Making"</a>. <i>Annual Review of Neuroscience</i>.
<b>35</b> (1): 287–308. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1146%2Fannurev-neuro-062111-150512">10.1146/annurev-neuro-062111-150512</a>. <a href="/wiki/PMC_(identifier)" class="mw-redirect" title="PMC (identifier)">PMC</a> <span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3490621">3490621</a></span>. <a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a> <a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/22462543">22462543</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Annual+Review+of+Neuroscience&rft.atitle=Neural+Basis+of+Reinforcement+Learning+and+Decision+Making&rft.volume=35&rft.issue=1&rft.pages=287-308&rft.date=2012-07-21&rft_id=https%3A%2F%2Fwww.ncbi.nlm.nih.gov%2Fpmc%2Farticles%2FPMC3490621%23id-name%3DPMC&rft_id=info%3Apmid%2F22462543&rft_id=info%3Adoi%2F10.1146%2Fannurev-neuro-062111-150512&rft.aulast=Lee&rft.aufirst=Daeyeol&rft.au=Seo%2C+Hyojung&rft.au=Jung%2C+Min+Whan&rft_id=https%3A%2F%2Fwww.ncbi.nlm.nih.gov%2Fpmc%2Farticles%2FPMC3490621&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-6"><span class="mw-cite-backlink"><b><a href="#cite_ref-6">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSalazar_DuqueGiraldoVergaraNguyen2022" class="citation journal cs1">Salazar Duque, Edgar Mauricio; Giraldo, Juan S.; Vergara, Pedro P.; Nguyen, Phuong; Van Der Molen, Anne; Slootweg, Han (2022). <a rel="nofollow" class="external text" href="https://doi.org/10.1016%2Fj.epsr.2022.108515">"Community energy storage operation via reinforcement learning with eligibility traces"</a>. <i>Electric Power Systems Research</i>. <b>212</b>. <a href="/wiki/Bibcode_(identifier)" class="mw-redirect" title="Bibcode (identifier)">Bibcode</a>:<a rel="nofollow" class="external text" href="https://ui.adsabs.harvard.edu/abs/2022EPSR..21208515S">2022EPSR..21208515S</a>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.1016%2Fj.epsr.2022.108515">10.1016/j.epsr.2022.108515</a></span>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:250635151">250635151</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Electric+Power+Systems+Research&rft.atitle=Community+energy+storage+operation+via+reinforcement+learning+with+eligibility+traces&rft.volume=212&rft.date=2022&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A250635151%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1016%2Fj.epsr.2022.108515&rft_id=info%3Abibcode%2F2022EPSR..21208515S&rft.aulast=Salazar+Duque&rft.aufirst=Edgar+Mauricio&rft.au=Giraldo%2C+Juan+S.&rft.au=Vergara%2C+Pedro+P.&rft.au=Nguyen%2C+Phuong&rft.au=Van+Der+Molen%2C+Anne&rft.au=Slootweg%2C+Han&rft_id=https%3A%2F%2Fdoi.org%2F10.1016%252Fj.epsr.2022.108515&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-7"><span class="mw-cite-backlink"><b><a href="#cite_ref-7">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFXieHung_Yu_LingNam_Hee_KimMichiel_van_de_Panne2020" class="citation arxiv cs1">Xie, Zhaoming; Hung Yu Ling; Nam Hee Kim; Michiel van de Panne (2020). "ALLSTEPS: Curriculum-driven Learning of Stepping Stone Skills". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2005.04323">2005.04323</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.GR">cs.GR</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=ALLSTEPS%3A+Curriculum-driven+Learning+of+Stepping+Stone+Skills&rft.date=2020&rft_id=info%3Aarxiv%2F2005.04323&rft.aulast=Xie&rft.aufirst=Zhaoming&rft.au=Hung+Yu+Ling&rft.au=Nam+Hee+Kim&rft.au=Michiel+van+de+Panne&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-8"><span class="mw-cite-backlink"><b><a href="#cite_ref-8">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFVergaraSalazarGiraldoPalensky2022" class="citation journal cs1">Vergara, Pedro P.; Salazar, Mauricio; Giraldo, Juan S.; Palensky, Peter (2022). <a rel="nofollow" class="external text" href="https://doi.org/10.1016%2Fj.ijepes.2021.107628">"Optimal dispatch of PV inverters in unbalanced distribution systems using Reinforcement Learning"</a>. <i>International Journal of Electrical Power & Energy Systems</i>. <b>136</b>. <a href="/wiki/Bibcode_(identifier)" class="mw-redirect" title="Bibcode (identifier)">Bibcode</a>:<a rel="nofollow" class="external text" href="https://ui.adsabs.harvard.edu/abs/2022IJEPE.13607628V">2022IJEPE.13607628V</a>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.1016%2Fj.ijepes.2021.107628">10.1016/j.ijepes.2021.107628</a></span>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:244099841">244099841</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=International+Journal+of+Electrical+Power+%26+Energy+Systems&rft.atitle=Optimal+dispatch+of+PV+inverters+in+unbalanced+distribution+systems+using+Reinforcement+Learning&rft.volume=136&rft.date=2022&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A244099841%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1016%2Fj.ijepes.2021.107628&rft_id=info%3Abibcode%2F2022IJEPE.13607628V&rft.aulast=Vergara&rft.aufirst=Pedro+P.&rft.au=Salazar%2C+Mauricio&rft.au=Giraldo%2C+Juan+S.&rft.au=Palensky%2C+Peter&rft_id=https%3A%2F%2Fdoi.org%2F10.1016%252Fj.ijepes.2021.107628&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-FOOTNOTESuttonBarto2018Chapter_11-9"><span class="mw-cite-backlink"><b><a href="#cite_ref-FOOTNOTESuttonBarto2018Chapter_11_9-0">^</a></b></span> <span class="reference-text"><a href="#CITEREFSuttonBarto2018">Sutton & Barto 2018</a>, Chapter 11.</span> </li> <li id="cite_note-Ren-2022-10"><span class="mw-cite-backlink"><b><a href="#cite_ref-Ren-2022_10-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRenJiangZhanLi2022" class="citation journal cs1">Ren, Yangang; Jiang, Jianhua; Zhan, Guojian; Li, Shengbo Eben; Chen, Chen; Li, Keqiang; Duan, Jingliang (2022). <a rel="nofollow" class="external text" href="https://ieeexplore.ieee.org/document/9857655">"Self-Learned Intelligence for Integrated Decision and Control of Automated Vehicles at Signalized Intersections"</a>. <i>IEEE Transactions on Intelligent Transportation Systems</i>. <b>23</b> (12): 24145–24156. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2110.12359">2110.12359</a></span>. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FTITS.2022.3196167">10.1109/TITS.2022.3196167</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=IEEE+Transactions+on+Intelligent+Transportation+Systems&rft.atitle=Self-Learned+Intelligence+for+Integrated+Decision+and+Control+of+Automated+Vehicles+at+Signalized+Intersections&rft.volume=23&rft.issue=12&rft.pages=24145-24156&rft.date=2022&rft_id=info%3Aarxiv%2F2110.12359&rft_id=info%3Adoi%2F10.1109%2FTITS.2022.3196167&rft.aulast=Ren&rft.aufirst=Yangang&rft.au=Jiang%2C+Jianhua&rft.au=Zhan%2C+Guojian&rft.au=Li%2C+Shengbo+Eben&rft.au=Chen%2C+Chen&rft.au=Li%2C+Keqiang&rft.au=Duan%2C+Jingliang&rft_id=https%3A%2F%2Fieeexplore.ieee.org%2Fdocument%2F9857655&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-11"><span class="mw-cite-backlink"><b><a href="#cite_ref-11">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFGosavi2003" class="citation book cs1"><a href="/w/index.php?title=Abhijit_Gosavi&action=edit&redlink=1" class="new" title="Abhijit Gosavi (page does not exist)">Gosavi, Abhijit</a> (2003). <a rel="nofollow" class="external text" href="https://www.springer.com/mathematics/applications/book/978-1-4020-7454-7"><i>Simulation-based Optimization: Parametric Optimization Techniques and Reinforcement</i></a>. Operations Research/Computer Science Interfaces Series. Springer. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-1-4020-7454-7" title="Special:BookSources/978-1-4020-7454-7"><bdi>978-1-4020-7454-7</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Simulation-based+Optimization%3A+Parametric+Optimization+Techniques+and+Reinforcement&rft.series=Operations+Research%2FComputer+Science+Interfaces+Series&rft.pub=Springer&rft.date=2003&rft.isbn=978-1-4020-7454-7&rft.aulast=Gosavi&rft.aufirst=Abhijit&rft_id=https%3A%2F%2Fwww.springer.com%2Fmathematics%2Fapplications%2Fbook%2F978-1-4020-7454-7&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-Optimal_adaptive_policies_for_Marko-12"><span class="mw-cite-backlink">^ <a href="#cite_ref-Optimal_adaptive_policies_for_Marko_12-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Optimal_adaptive_policies_for_Marko_12-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBurnetasKatehakis1997" class="citation cs2">Burnetas, Apostolos N.; <a href="/wiki/Michael_N._Katehakis" class="mw-redirect" title="Michael N. 
Katehakis">Katehakis, Michael N.</a> (1997), "Optimal adaptive policies for Markov Decision Processes", <i><a href="/wiki/Mathematics_of_Operations_Research" title="Mathematics of Operations Research">Mathematics of Operations Research</a></i>, <b>22</b> (1): 222–255, <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1287%2Fmoor.22.1.222">10.1287/moor.22.1.222</a>, <a href="/wiki/JSTOR_(identifier)" class="mw-redirect" title="JSTOR (identifier)">JSTOR</a> <a rel="nofollow" class="external text" href="https://www.jstor.org/stable/3690147">3690147</a></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Mathematics+of+Operations+Research&rft.atitle=Optimal+adaptive+policies+for+Markov+Decision+Processes&rft.volume=22&rft.issue=1&rft.pages=222-255&rft.date=1997&rft_id=info%3Adoi%2F10.1287%2Fmoor.22.1.222&rft_id=https%3A%2F%2Fwww.jstor.org%2Fstable%2F3690147%23id-name%3DJSTOR&rft.aulast=Burnetas&rft.aufirst=Apostolos+N.&rft.au=Katehakis%2C+Michael+N.&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-13"><span class="mw-cite-backlink"><b><a href="#cite_ref-13">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFTokicPalm2011" class="citation cs2">Tokic, Michel; Palm, Günther (2011), <a rel="nofollow" class="external text" href="http://www.tokic.com/www/tokicm/publikationen/papers/KI2011.pdf">"Value-Difference Based Exploration: Adaptive Control Between Epsilon-Greedy and Softmax"</a> <span class="cs1-format">(PDF)</span>, <i>KI 2011: Advances in Artificial Intelligence</i>, Lecture Notes in Computer Science, vol. 7006, Springer, pp. 335–346, <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-3-642-24455-1" title="Special:BookSources/978-3-642-24455-1"><bdi>978-3-642-24455-1</bdi></a></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Value-Difference+Based+Exploration%3A+Adaptive+Control+Between+Epsilon-Greedy+and+Softmax&rft.btitle=KI+2011%3A+Advances+in+Artificial+Intelligence&rft.series=Lecture+Notes+in+Computer+Science&rft.pages=335-346&rft.pub=Springer&rft.date=2011&rft.isbn=978-3-642-24455-1&rft.aulast=Tokic&rft.aufirst=Michel&rft.au=Palm%2C+G%C3%BCnther&rft_id=http%3A%2F%2Fwww.tokic.com%2Fwww%2Ftokicm%2Fpublikationen%2Fpapers%2FKI2011.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-:0-14"><span class="mw-cite-backlink">^ <a href="#cite_ref-:0_14-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:0_14-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-:0_14-2"><sup><i><b>c</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://web.archive.org/web/20170712170739/http://people.inf.elte.hu/lorincz/Files/RL_2006/SuttonBook.pdf">"Reinforcement learning: An introduction"</a> <span class="cs1-format">(PDF)</span>. 
Archived from <a rel="nofollow" class="external text" href="http://people.inf.elte.hu/lorincz/Files/RL_2006/SuttonBook.pdf">the original</a> <span class="cs1-format">(PDF)</span> on 2017-07-12<span class="reference-accessdate">. Retrieved <span class="nowrap">2017-07-23</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Reinforcement+learning%3A+An+introduction&rft_id=http%3A%2F%2Fpeople.inf.elte.hu%2Florincz%2FFiles%2FRL_2006%2FSuttonBook.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-15"><span class="mw-cite-backlink"><b><a href="#cite_ref-15">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSinghSutton1996" class="citation journal cs1">Singh, Satinder P.; Sutton, Richard S. (1996-03-01). <a rel="nofollow" class="external text" href="https://link.springer.com/article/10.1007/BF00114726">"Reinforcement learning with replacing eligibility traces"</a>. <i>Machine Learning</i>. <b>22</b> (1): 123–158. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1007%2FBF00114726">10.1007/BF00114726</a>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/1573-0565">1573-0565</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Machine+Learning&rft.atitle=Reinforcement+learning+with+replacing+eligibility+traces&rft.volume=22&rft.issue=1&rft.pages=123-158&rft.date=1996-03-01&rft_id=info%3Adoi%2F10.1007%2FBF00114726&rft.issn=1573-0565&rft.aulast=Singh&rft.aufirst=Satinder+P.&rft.au=Sutton%2C+Richard+S.&rft_id=https%3A%2F%2Flink.springer.com%2Farticle%2F10.1007%2FBF00114726&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-16"><span class="mw-cite-backlink"><b><a href="#cite_ref-16">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSutton1984" class="citation thesis cs1"><a href="/wiki/Richard_S._Sutton" title="Richard S. Sutton">Sutton, Richard S.</a> (1984). <a rel="nofollow" class="external text" href="https://web.archive.org/web/20170330002227/http://incompleteideas.net/sutton/publications.html#PhDthesis"><i>Temporal Credit Assignment in Reinforcement Learning</i></a> (PhD thesis). University of Massachusetts, Amherst, MA. Archived from <a rel="nofollow" class="external text" href="http://incompleteideas.net/sutton/publications.html#PhDthesis">the original</a> on 2017-03-30<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2017-03-29</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Adissertation&rft.title=Temporal+Credit+Assignment+in+Reinforcement+Learning&rft.degree=PhD&rft.inst=University+of+Massachusetts%2C+Amherst%2C+MA&rft.date=1984&rft.aulast=Sutton&rft.aufirst=Richard+S.&rft_id=http%3A%2F%2Fincompleteideas.net%2Fsutton%2Fpublications.html%23PhDthesis&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-FOOTNOTESuttonBarto2018[httpincompleteideasnetsuttonbookebooknode60html_§6._Temporal-Difference_Learning]-17"><span class="mw-cite-backlink"><b><a href="#cite_ref-FOOTNOTESuttonBarto2018[httpincompleteideasnetsuttonbookebooknode60html_§6._Temporal-Difference_Learning]_17-0">^</a></b></span> <span class="reference-text"><a href="#CITEREFSuttonBarto2018">Sutton & Barto 2018</a>, <a rel="nofollow" class="external text" href="http://incompleteideas.net/sutton/book/ebook/node60.html">§6. Temporal-Difference Learning</a>.</span> </li> <li id="cite_note-18"><span class="mw-cite-backlink"><b><a href="#cite_ref-18">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBradtkeBarto1996" class="citation journal cs1"><a href="/w/index.php?title=Steven_J._Bradtke&action=edit&redlink=1" class="new" title="Steven J. Bradtke (page does not exist)">Bradtke, Steven J.</a>; <a href="/wiki/Andrew_G._Barto" class="mw-redirect" title="Andrew G. Barto">Barto, Andrew G.</a> (1996). "Learning to predict by the method of temporal differences". <i>Machine Learning</i>. <b>22</b>: 33–57. <a href="/wiki/CiteSeerX_(identifier)" class="mw-redirect" title="CiteSeerX (identifier)">CiteSeerX</a> <span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.143.857">10.1.1.143.857</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1023%2FA%3A1018056104778">10.1023/A:1018056104778</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:20327856">20327856</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Machine+Learning&rft.atitle=Learning+to+predict+by+the+method+of+temporal+differences&rft.volume=22&rft.pages=33-57&rft.date=1996&rft_id=https%3A%2F%2Fciteseerx.ist.psu.edu%2Fviewdoc%2Fsummary%3Fdoi%3D10.1.1.143.857%23id-name%3DCiteSeerX&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A20327856%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1023%2FA%3A1018056104778&rft.aulast=Bradtke&rft.aufirst=Steven+J.&rft.au=Barto%2C+Andrew+G.&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-19"><span class="mw-cite-backlink"><b><a href="#cite_ref-19">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWatkins1989" class="citation thesis cs1"><a href="/w/index.php?title=Christopher_J.C.H._Watkins&action=edit&redlink=1" class="new" title="Christopher J.C.H. Watkins (page does not exist)">Watkins, Christopher J.C.H.</a> (1989). 
<a rel="nofollow" class="external text" href="http://www.cs.rhul.ac.uk/~chrisw/new_thesis.pdf"><i>Learning from Delayed Rewards</i></a> <span class="cs1-format">(PDF)</span> (PhD thesis). King’s College, Cambridge, UK.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Adissertation&rft.title=Learning+from+Delayed+Rewards&rft.degree=PhD&rft.inst=King%E2%80%99s+College%2C+Cambridge%2C+UK&rft.date=1989&rft.aulast=Watkins&rft.aufirst=Christopher+J.C.H.&rft_id=http%3A%2F%2Fwww.cs.rhul.ac.uk%2F~chrisw%2Fnew_thesis.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-MBK-20"><span class="mw-cite-backlink"><b><a href="#cite_ref-MBK_20-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFMatzliachBen-GalKagan2022" class="citation journal cs1">Matzliach, Barouch; Ben-Gal, Irad; Kagan, Evgeny (2022). <a rel="nofollow" class="external text" href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9407070">"Detection of Static and Mobile Targets by an Autonomous Agent with Deep Q-Learning Abilities"</a>. <i>Entropy</i>. <b>24</b> (8): 1168. <a href="/wiki/Bibcode_(identifier)" class="mw-redirect" title="Bibcode (identifier)">Bibcode</a>:<a rel="nofollow" class="external text" href="https://ui.adsabs.harvard.edu/abs/2022Entrp..24.1168M">2022Entrp..24.1168M</a>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.3390%2Fe24081168">10.3390/e24081168</a></span>. <a href="/wiki/PMC_(identifier)" class="mw-redirect" title="PMC (identifier)">PMC</a> <span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9407070">9407070</a></span>. <a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a> <a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/36010832">36010832</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Entropy&rft.atitle=Detection+of+Static+and+Mobile+Targets+by+an+Autonomous+Agent+with+Deep+Q-Learning+Abilities&rft.volume=24&rft.issue=8&rft.pages=1168&rft.date=2022&rft_id=https%3A%2F%2Fwww.ncbi.nlm.nih.gov%2Fpmc%2Farticles%2FPMC9407070%23id-name%3DPMC&rft_id=info%3Apmid%2F36010832&rft_id=info%3Adoi%2F10.3390%2Fe24081168&rft_id=info%3Abibcode%2F2022Entrp..24.1168M&rft.aulast=Matzliach&rft.aufirst=Barouch&rft.au=Ben-Gal%2C+Irad&rft.au=Kagan%2C+Evgeny&rft_id=https%3A%2F%2Fwww.ncbi.nlm.nih.gov%2Fpmc%2Farticles%2FPMC9407070&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-21"><span class="mw-cite-backlink"><b><a href="#cite_ref-21">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWilliams1987" class="citation conference cs1"><a href="/wiki/Ronald_J._Williams" title="Ronald J. Williams">Williams, Ronald J.</a> (1987). "A class of gradient-estimating algorithms for reinforcement learning in neural networks". <i>Proceedings of the IEEE First International Conference on Neural Networks</i>. 
<a href="/wiki/CiteSeerX_(identifier)" class="mw-redirect" title="CiteSeerX (identifier)">CiteSeerX</a> <span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.129.8871">10.1.1.129.8871</a></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.atitle=A+class+of+gradient-estimating+algorithms+for+reinforcement+learning+in+neural+networks&rft.btitle=Proceedings+of+the+IEEE+First+International+Conference+on+Neural+Networks&rft.date=1987&rft_id=https%3A%2F%2Fciteseerx.ist.psu.edu%2Fviewdoc%2Fsummary%3Fdoi%3D10.1.1.129.8871%23id-name%3DCiteSeerX&rft.aulast=Williams&rft.aufirst=Ronald+J.&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-22"><span class="mw-cite-backlink"><b><a href="#cite_ref-22">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPetersVijayakumarSchaal2003" class="citation conference cs1"><a href="/wiki/Jan_Peters_(computer_scientist)" title="Jan Peters (computer scientist)">Peters, Jan</a>; <a href="/wiki/Sethu_Vijayakumar" title="Sethu Vijayakumar">Vijayakumar, Sethu</a>; <a href="/wiki/Stefan_Schaal" title="Stefan Schaal">Schaal, Stefan</a> (2003). <a rel="nofollow" class="external text" href="http://web.archive.org/web/20130512223911/http://www-clmc.usc.edu/publications/p/peters-ICHR2003.pdf"><i>Reinforcement Learning for Humanoid Robotics</i></a> <span class="cs1-format">(PDF)</span>. IEEE-RAS International Conference on Humanoid Robots. Archived from <a rel="nofollow" class="external text" href="http://www-clmc.usc.edu/publications/p/peters-ICHR2003.pdf">the original</a> <span class="cs1-format">(PDF)</span> on 2013-05-12.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.btitle=Reinforcement+Learning+for+Humanoid+Robotics&rft.date=2003&rft.aulast=Peters&rft.aufirst=Jan&rft.au=Vijayakumar%2C+Sethu&rft.au=Schaal%2C+Stefan&rft_id=http%3A%2F%2Fwww-clmc.usc.edu%2Fpublications%2Fp%2Fpeters-ICHR2003.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-23"><span class="mw-cite-backlink"><b><a href="#cite_ref-23">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFJuliani2016" class="citation web cs1">Juliani, Arthur (2016-12-17). <a rel="nofollow" class="external text" href="https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2">"Simple Reinforcement Learning with Tensorflow Part 8: Asynchronous Actor-Critic Agents (A3C)"</a>. <i>Medium</i><span class="reference-accessdate">. 
Retrieved <span class="nowrap">2018-02-22</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Medium&rft.atitle=Simple+Reinforcement+Learning+with+Tensorflow+Part+8%3A+Asynchronous+Actor-Critic+Agents+%28A3C%29&rft.date=2016-12-17&rft.aulast=Juliani&rft.aufirst=Arthur&rft_id=https%3A%2F%2Fmedium.com%2Femergent-future%2Fsimple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-24"><span class="mw-cite-backlink"><b><a href="#cite_ref-24">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDeisenrothNeumannPeters2013" class="citation book cs1"><a href="/w/index.php?title=Marc_Peter_Deisenroth&action=edit&redlink=1" class="new" title="Marc Peter Deisenroth (page does not exist)">Deisenroth, Marc Peter</a>; <a href="/wiki/Gerhard_Neumann" title="Gerhard Neumann">Neumann, Gerhard</a>; <a href="/wiki/Jan_Peters_(computer_scientist)" title="Jan Peters (computer scientist)">Peters, Jan</a> (2013). <a rel="nofollow" class="external text" href="http://eprints.lincoln.ac.uk/28029/1/PolicySearchReview.pdf"><i>A Survey on Policy Search for Robotics</i></a> <span class="cs1-format">(PDF)</span>. Foundations and Trends in Robotics. Vol. 2. NOW Publishers. pp. 1–142. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1561%2F2300000021">10.1561/2300000021</a>. <a href="/wiki/Hdl_(identifier)" class="mw-redirect" title="Hdl (identifier)">hdl</a>:<a rel="nofollow" class="external text" href="https://hdl.handle.net/10044%2F1%2F12051">10044/1/12051</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=A+Survey+on+Policy+Search+for+Robotics&rft.series=Foundations+and+Trends+in+Robotics&rft.pages=1-142&rft.pub=NOW+Publishers&rft.date=2013&rft_id=info%3Ahdl%2F10044%2F1%2F12051&rft_id=info%3Adoi%2F10.1561%2F2300000021&rft.aulast=Deisenroth&rft.aufirst=Marc+Peter&rft.au=Neumann%2C+Gerhard&rft.au=Peters%2C+Jan&rft_id=http%3A%2F%2Feprints.lincoln.ac.uk%2F28029%2F1%2FPolicySearchReview.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-25"><span class="mw-cite-backlink"><b><a href="#cite_ref-25">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSutton1990" class="citation conference cs1">Sutton, Richard (1990). "Integrated Architectures for Learning, Planning and Reacting based on Dynamic Programming". 
<i>Machine Learning: Proceedings of the Seventh International Workshop</i>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.atitle=Integrated+Architectures+for+Learning%2C+Planning+and+Reacting+based+on+Dynamic+Programming&rft.btitle=Machine+Learning%3A+Proceedings+of+the+Seventh+International+Workshop&rft.date=1990&rft.aulast=Sutton&rft.aufirst=Richard&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-26"><span class="mw-cite-backlink"><b><a href="#cite_ref-26">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLin1992" class="citation conference cs1">Lin, Long-Ji (1992). <a rel="nofollow" class="external text" href="https://link.springer.com/content/pdf/10.1007/BF00992699.pdf">"Self-improving reactive agents based on reinforcement learning, planning and teaching"</a> <span class="cs1-format">(PDF)</span>. <i>Machine Learning volume 8</i>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1007%2FBF00992699">10.1007/BF00992699</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.atitle=Self-improving+reactive+agents+based+on+reinforcement+learning%2C+planning+and+teaching&rft.btitle=Machine+Learning+volume+8&rft.date=1992&rft_id=info%3Adoi%2F10.1007%2FBF00992699&rft.aulast=Lin&rft.aufirst=Long-Ji&rft_id=https%3A%2F%2Flink.springer.com%2Fcontent%2Fpdf%2F10.1007%2FBF00992699.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-27"><span class="mw-cite-backlink"><b><a href="#cite_ref-27">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFZou2023" class="citation cs2">Zou, Lan (2023-01-01), Zou, Lan (ed.), <a rel="nofollow" class="external text" href="https://www.sciencedirect.com/science/article/pii/B9780323899314000110">"Chapter 7 - Meta-reinforcement learning"</a>, <i>Meta-Learning</i>, Academic Press, pp. 
267–297, <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1016%2Fb978-0-323-89931-4.00011-0">10.1016/b978-0-323-89931-4.00011-0</a>, <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-0-323-89931-4" title="Special:BookSources/978-0-323-89931-4"><bdi>978-0-323-89931-4</bdi></a><span class="reference-accessdate">, retrieved <span class="nowrap">2023-11-08</span></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Meta-Learning&rft.atitle=Chapter+7+-+Meta-reinforcement+learning&rft.pages=267-297&rft.date=2023-01-01&rft_id=info%3Adoi%2F10.1016%2Fb978-0-323-89931-4.00011-0&rft.isbn=978-0-323-89931-4&rft.aulast=Zou&rft.aufirst=Lan&rft_id=https%3A%2F%2Fwww.sciencedirect.com%2Fscience%2Farticle%2Fpii%2FB9780323899314000110&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-28"><span class="mw-cite-backlink"><b><a href="#cite_ref-28">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFvan_HasseltHesselAslanides2019" class="citation conference cs1">van Hasselt, Hado; Hessel, Matteo; Aslanides, John (2019). <a rel="nofollow" class="external text" href="https://proceedings.neurips.cc/paper/2019/file/1b742ae215adf18b75449c6e272fd92d-Paper.pdf">"When to use parametric models in reinforcement learning?"</a> <span class="cs1-format">(PDF)</span>. <i>Advances in Neural Information Processing Systems 32</i>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.atitle=When+to+use+parametric+models+in+reinforcement+learning%3F&rft.btitle=Advances+in+Neural+Information+Processing+Systems+32&rft.date=2019&rft.aulast=van+Hasselt&rft.aufirst=Hado&rft.au=Hessel%2C+Matteo&rft.au=Aslanides%2C+John&rft_id=https%3A%2F%2Fproceedings.neurips.cc%2Fpaper%2F2019%2Ffile%2F1b742ae215adf18b75449c6e272fd92d-Paper.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-29"><span class="mw-cite-backlink"><b><a href="#cite_ref-29">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFGrondmanVaandragerBusoniuBabuska2012" class="citation journal cs1">Grondman, Ivo; Vaandrager, Maarten; Busoniu, Lucian; Babuska, Robert; Schuitema, Erik (2012-06-01). <a rel="nofollow" class="external text" href="https://dl.acm.org/doi/10.1109/TSMCB.2011.2170565">"Efficient Model Learning Methods for Actor–Critic Control"</a>. <i>IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics)</i>. <b>42</b> (3): 591–602. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FTSMCB.2011.2170565">10.1109/TSMCB.2011.2170565</a>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/1083-4419">1083-4419</a>. 
<a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a> <a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/22156998">22156998</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=IEEE+Transactions+on+Systems%2C+Man%2C+and+Cybernetics%2C+Part+B+%28Cybernetics%29&rft.atitle=Efficient+Model+Learning+Methods+for+Actor%E2%80%93Critic+Control&rft.volume=42&rft.issue=3&rft.pages=591-602&rft.date=2012-06-01&rft.issn=1083-4419&rft_id=info%3Apmid%2F22156998&rft_id=info%3Adoi%2F10.1109%2FTSMCB.2011.2170565&rft.aulast=Grondman&rft.aufirst=Ivo&rft.au=Vaandrager%2C+Maarten&rft.au=Busoniu%2C+Lucian&rft.au=Babuska%2C+Robert&rft.au=Schuitema%2C+Erik&rft_id=https%3A%2F%2Fdl.acm.org%2Fdoi%2F10.1109%2FTSMCB.2011.2170565&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-30"><span class="mw-cite-backlink"><b><a href="#cite_ref-30">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://cie.acm.org/articles/use-reinforcements-learning-testing-game-mechanics/">"On the Use of Reinforcement Learning for Testing Game Mechanics : ACM - Computers in Entertainment"</a>. <i>cie.acm.org</i><span class="reference-accessdate">. Retrieved <span class="nowrap">2018-11-27</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=cie.acm.org&rft.atitle=On+the+Use+of+Reinforcement+Learning+for+Testing+Game+Mechanics+%3A+ACM+-+Computers+in+Entertainment&rft_id=https%3A%2F%2Fcie.acm.org%2Farticles%2Fuse-reinforcements-learning-testing-game-mechanics%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-31"><span class="mw-cite-backlink"><b><a href="#cite_ref-31">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRiveretGao2019" class="citation journal cs1">Riveret, Regis; Gao, Yang (2019). "A probabilistic argumentation framework for reinforcement learning agents". <i>Autonomous Agents and Multi-Agent Systems</i>. <b>33</b> (1–2): 216–274. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1007%2Fs10458-019-09404-2">10.1007/s10458-019-09404-2</a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:71147890">71147890</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Autonomous+Agents+and+Multi-Agent+Systems&rft.atitle=A+probabilistic+argumentation+framework+for+reinforcement+learning+agents&rft.volume=33&rft.issue=1%E2%80%932&rft.pages=216-274&rft.date=2019&rft_id=info%3Adoi%2F10.1007%2Fs10458-019-09404-2&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A71147890%23id-name%3DS2CID&rft.aulast=Riveret&rft.aufirst=Regis&rft.au=Gao%2C+Yang&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-32"><span class="mw-cite-backlink"><b><a href="#cite_ref-32">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFYamagataMcConvilleSantos-Rodriguez2021" class="citation arxiv cs1">Yamagata, Taku; McConville, Ryan; Santos-Rodriguez, Raul (2021-11-16). "Reinforcement Learning with Feedback from Multiple Humans with Diverse Skills". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2111.08596">2111.08596</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Reinforcement+Learning+with+Feedback+from+Multiple+Humans+with+Diverse+Skills&rft.date=2021-11-16&rft_id=info%3Aarxiv%2F2111.08596&rft.aulast=Yamagata&rft.aufirst=Taku&rft.au=McConville%2C+Ryan&rft.au=Santos-Rodriguez%2C+Raul&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-33"><span class="mw-cite-backlink"><b><a href="#cite_ref-33">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKulkarniNarasimhanSaeediTenenbaum2016" class="citation journal cs1">Kulkarni, Tejas D.; Narasimhan, Karthik R.; Saeedi, Ardavan; Tenenbaum, Joshua B. (2016). <a rel="nofollow" class="external text" href="http://dl.acm.org/citation.cfm?id=3157382.3157509">"Hierarchical Deep Reinforcement Learning: Integrating Temporal Abstraction and Intrinsic Motivation"</a>. <i>Proceedings of the 30th International Conference on Neural Information Processing Systems</i>. NIPS'16. USA: Curran Associates Inc.: 3682–3690. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1604.06057">1604.06057</a></span>. <a href="/wiki/Bibcode_(identifier)" class="mw-redirect" title="Bibcode (identifier)">Bibcode</a>:<a rel="nofollow" class="external text" href="https://ui.adsabs.harvard.edu/abs/2016arXiv160406057K">2016arXiv160406057K</a>. 
<a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-1-5108-3881-9" title="Special:BookSources/978-1-5108-3881-9"><bdi>978-1-5108-3881-9</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Proceedings+of+the+30th+International+Conference+on+Neural+Information+Processing+Systems&rft.atitle=Hierarchical+Deep+Reinforcement+Learning%3A+Integrating+Temporal+Abstraction+and+Intrinsic+Motivation&rft.pages=3682-3690&rft.date=2016&rft_id=info%3Aarxiv%2F1604.06057&rft_id=info%3Abibcode%2F2016arXiv160406057K&rft.isbn=978-1-5108-3881-9&rft.aulast=Kulkarni&rft.aufirst=Tejas+D.&rft.au=Narasimhan%2C+Karthik+R.&rft.au=Saeedi%2C+Ardavan&rft.au=Tenenbaum%2C+Joshua+B.&rft_id=http%3A%2F%2Fdl.acm.org%2Fcitation.cfm%3Fid%3D3157382.3157509&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-34"><span class="mw-cite-backlink"><b><a href="#cite_ref-34">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="http://umichrl.pbworks.com/Successes-of-Reinforcement-Learning/">"Reinforcement Learning / Successes of Reinforcement Learning"</a>. <i>umichrl.pbworks.com</i><span class="reference-accessdate">. Retrieved <span class="nowrap">2017-08-06</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=umichrl.pbworks.com&rft.atitle=Reinforcement+Learning+%2F+Successes+of+Reinforcement+Learning&rft_id=http%3A%2F%2Fumichrl.pbworks.com%2FSuccesses-of-Reinforcement-Learning%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-35"><span class="mw-cite-backlink"><b><a href="#cite_ref-35">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDeySinghWangMcDonald-Maier2020" class="citation book cs1">Dey, Somdip; Singh, Amit Kumar; Wang, Xiaohang; McDonald-Maier, Klaus (March 2020). <a rel="nofollow" class="external text" href="https://ieeexplore.ieee.org/document/9116294">"User Interaction Aware Reinforcement Learning for Power and Thermal Efficiency of CPU-GPU Mobile MPSoCs"</a>. <a rel="nofollow" class="external text" href="http://repository.essex.ac.uk/27546/1/User%20Interaction%20Aware%20Reinforcement%20Learning.pdf"><i>2020 Design, Automation & Test in Europe Conference & Exhibition (DATE)</i></a> <span class="cs1-format">(PDF)</span>. pp. 1728–1733. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.23919%2FDATE48585.2020.9116294">10.23919/DATE48585.2020.9116294</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-3-9819263-4-7" title="Special:BookSources/978-3-9819263-4-7"><bdi>978-3-9819263-4-7</bdi></a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:219858480">219858480</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=User+Interaction+Aware+Reinforcement+Learning+for+Power+and+Thermal+Efficiency+of+CPU-GPU+Mobile+MPSoCs&rft.btitle=2020+Design%2C+Automation+%26+Test+in+Europe+Conference+%26+Exhibition+%28DATE%29&rft.pages=1728-1733&rft.date=2020-03&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A219858480%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.23919%2FDATE48585.2020.9116294&rft.isbn=978-3-9819263-4-7&rft.aulast=Dey&rft.aufirst=Somdip&rft.au=Singh%2C+Amit+Kumar&rft.au=Wang%2C+Xiaohang&rft.au=McDonald-Maier%2C+Klaus&rft_id=https%3A%2F%2Fieeexplore.ieee.org%2Fdocument%2F9116294&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-36"><span class="mw-cite-backlink"><b><a href="#cite_ref-36">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFQuested" class="citation web cs1">Quested, Tony. <a rel="nofollow" class="external text" href="https://www.businessweekly.co.uk/news/academia-research/smartphones-get-smarter-essex-innovation">"Smartphones get smarter with Essex innovation"</a>. <i>Business Weekly</i><span class="reference-accessdate">. Retrieved <span class="nowrap">2021-06-17</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Business+Weekly&rft.atitle=Smartphones+get+smarter+with+Essex+innovation&rft.aulast=Quested&rft.aufirst=Tony&rft_id=https%3A%2F%2Fwww.businessweekly.co.uk%2Fnews%2Facademia-research%2Fsmartphones-get-smarter-essex-innovation&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-37"><span class="mw-cite-backlink"><b><a href="#cite_ref-37">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWilliams2020" class="citation web cs1">Williams, Rhiannon (2020-07-21). <a rel="nofollow" class="external text" href="https://inews.co.uk/news/technology/future-smartphones-prolong-battery-life-monitoring-behaviour-558689">"Future smartphones 'will prolong their own battery life by monitoring owners' behaviour'<span class="cs1-kern-right"></span>"</a>. <i><a href="/wiki/I_(newspaper)" title="I (newspaper)">i</a></i><span class="reference-accessdate">. 
Retrieved <span class="nowrap">2021-06-17</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=i&rft.atitle=Future+smartphones+%27will+prolong+their+own+battery+life+by+monitoring+owners%27+behaviour%27&rft.date=2020-07-21&rft.aulast=Williams&rft.aufirst=Rhiannon&rft_id=https%3A%2F%2Finews.co.uk%2Fnews%2Ftechnology%2Ffuture-smartphones-prolong-battery-life-monitoring-behaviour-558689&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-kaplan2004-38"><span class="mw-cite-backlink"><b><a href="#cite_ref-kaplan2004_38-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKaplanOudeyer2004" class="citation book cs1">Kaplan, F.; Oudeyer, P. (2004). "Maximizing Learning Progress: An Internal Reward System for Development". In Iida, F.; Pfeifer, R.; Steels, L.; Kuniyoshi, Y. (eds.). <i>Embodied Artificial Intelligence</i>. Lecture Notes in Computer Science. Vol. 3139. Berlin; Heidelberg: Springer. pp. 259–270. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1007%2F978-3-540-27833-7_19">10.1007/978-3-540-27833-7_19</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-3-540-22484-6" title="Special:BookSources/978-3-540-22484-6"><bdi>978-3-540-22484-6</bdi></a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:9781221">9781221</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Maximizing+Learning+Progress%3A+An+Internal+Reward+System+for+Development&rft.btitle=Embodied+Artificial+Intelligence&rft.place=Berlin%3B+Heidelberg&rft.series=Lecture+Notes+in+Computer+Science&rft.pages=259-270&rft.pub=Springer&rft.date=2004&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A9781221%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1007%2F978-3-540-27833-7_19&rft.isbn=978-3-540-22484-6&rft.aulast=Kaplan&rft.aufirst=F.&rft.au=Oudeyer%2C+P.&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-klyubin2008-39"><span class="mw-cite-backlink"><b><a href="#cite_ref-klyubin2008_39-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKlyubinPolaniNehaniv2008" class="citation journal cs1">Klyubin, A.; Polani, D.; Nehaniv, C. (2008). <a rel="nofollow" class="external text" href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2607028">"Keep your options open: an information-based driving principle for sensorimotor systems"</a>. <i>PLOS ONE</i>. <b>3</b> (12): e4018. <a href="/wiki/Bibcode_(identifier)" class="mw-redirect" title="Bibcode (identifier)">Bibcode</a>:<a rel="nofollow" class="external text" href="https://ui.adsabs.harvard.edu/abs/2008PLoSO...3.4018K">2008PLoSO...3.4018K</a>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.1371%2Fjournal.pone.0004018">10.1371/journal.pone.0004018</a></span>. 
<a href="/wiki/PMC_(identifier)" class="mw-redirect" title="PMC (identifier)">PMC</a> <span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2607028">2607028</a></span>. <a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a> <a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/19107219">19107219</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=PLOS+ONE&rft.atitle=Keep+your+options+open%3A+an+information-based+driving+principle+for+sensorimotor+systems&rft.volume=3&rft.issue=12&rft.pages=e4018&rft.date=2008&rft_id=https%3A%2F%2Fwww.ncbi.nlm.nih.gov%2Fpmc%2Farticles%2FPMC2607028%23id-name%3DPMC&rft_id=info%3Apmid%2F19107219&rft_id=info%3Adoi%2F10.1371%2Fjournal.pone.0004018&rft_id=info%3Abibcode%2F2008PLoSO...3.4018K&rft.aulast=Klyubin&rft.aufirst=A.&rft.au=Polani%2C+D.&rft.au=Nehaniv%2C+C.&rft_id=https%3A%2F%2Fwww.ncbi.nlm.nih.gov%2Fpmc%2Farticles%2FPMC2607028&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-barto2013-40"><span class="mw-cite-backlink"><b><a href="#cite_ref-barto2013_40-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBarto2013" class="citation book cs1">Barto, A. G. (2013). "Intrinsic motivation and reinforcement learning". <a rel="nofollow" class="external text" href="https://people.cs.umass.edu/~barto/IMCleVer-chapter-totypeset2.pdf"><i>Intrinsically Motivated Learning in Natural and Artificial Systems</i></a> <span class="cs1-format">(PDF)</span>. Berlin; Heidelberg: Springer. pp. 17–47.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Intrinsic+motivation+and+reinforcement+learning&rft.btitle=Intrinsically+Motivated+Learning+in+Natural+and+Artificial+Systems&rft.place=Berlin%3B+Heidelberg&rft.pages=17-47&rft.pub=Springer&rft.date=2013&rft.aulast=Barto&rft.aufirst=A.+G.&rft_id=https%3A%2F%2Fpeople.cs.umass.edu%2F~barto%2FIMCleVer-chapter-totypeset2.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-41"><span class="mw-cite-backlink"><b><a href="#cite_ref-41">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDabériusGranatKarlsson2020" class="citation journal cs1">Dabérius, Kevin; Granat, Elvin; Karlsson, Patrik (2020). "Deep Execution - Value and Policy Based Reinforcement Learning for Trading and Beating Market Benchmarks". <i>The Journal of Machine Learning in Finance</i>. <b>1</b>. 
<a href="/wiki/SSRN_(identifier)" class="mw-redirect" title="SSRN (identifier)">SSRN</a> <a rel="nofollow" class="external text" href="https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3374766">3374766</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=The+Journal+of+Machine+Learning+in+Finance&rft.atitle=Deep+Execution+-+Value+and+Policy+Based+Reinforcement+Learning+for+Trading+and+Beating+Market+Benchmarks&rft.volume=1&rft.date=2020&rft_id=https%3A%2F%2Fpapers.ssrn.com%2Fsol3%2Fpapers.cfm%3Fabstract_id%3D3374766%23id-name%3DSSRN&rft.aulast=Dab%C3%A9rius&rft.aufirst=Kevin&rft.au=Granat%2C+Elvin&rft.au=Karlsson%2C+Patrik&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-42"><span class="mw-cite-backlink"><b><a href="#cite_ref-42">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFGeorge_KarimpanalBouffanais2019" class="citation journal cs1">George Karimpanal, Thommen; Bouffanais, Roland (2019). "Self-organizing maps for storage and transfer of knowledge in reinforcement learning". <i>Adaptive Behavior</i>. <b>27</b> (2): 111–126. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1811.08318">1811.08318</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1177%2F1059712318818568">10.1177/1059712318818568</a>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/1059-7123">1059-7123</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:53774629">53774629</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Adaptive+Behavior&rft.atitle=Self-organizing+maps+for+storage+and+transfer+of+knowledge+in+reinforcement+learning&rft.volume=27&rft.issue=2&rft.pages=111-126&rft.date=2019&rft_id=info%3Aarxiv%2F1811.08318&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A53774629%23id-name%3DS2CID&rft.issn=1059-7123&rft_id=info%3Adoi%2F10.1177%2F1059712318818568&rft.aulast=George+Karimpanal&rft.aufirst=Thommen&rft.au=Bouffanais%2C+Roland&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-43"><span class="mw-cite-backlink"><b><a href="#cite_ref-43">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFJ_DuanY_GuanS_Li2021" class="citation journal cs1">J Duan; Y Guan; S Li (2021). <a rel="nofollow" class="external text" href="https://ieeexplore.ieee.org/document/9448360">"Distributional Soft Actor-Critic: Off-policy reinforcement learning for addressing value estimation errors"</a>. <i>IEEE Transactions on Neural Networks and Learning Systems</i>. <b>33</b> (11): 6584–6598. 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2001.02811">2001.02811</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FTNNLS.2021.3082568">10.1109/TNNLS.2021.3082568</a>. <a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a> <a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/34101599">34101599</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:211259373">211259373</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=IEEE+Transactions+on+Neural+Networks+and+Learning+Systems&rft.atitle=Distributional+Soft+Actor-Critic%3A+Off-policy+reinforcement+learning+for+addressing+value+estimation+errors&rft.volume=33&rft.issue=11&rft.pages=6584-6598&rft.date=2021&rft_id=info%3Aarxiv%2F2001.02811&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A211259373%23id-name%3DS2CID&rft_id=info%3Apmid%2F34101599&rft_id=info%3Adoi%2F10.1109%2FTNNLS.2021.3082568&rft.au=J+Duan&rft.au=Y+Guan&rft.au=S+Li&rft_id=https%3A%2F%2Fieeexplore.ieee.org%2Fdocument%2F9448360&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-44"><span class="mw-cite-backlink"><b><a href="#cite_ref-44">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFY_RenJ_DuanS_Li2020" class="citation book cs1">Y Ren; J Duan; S Li (2020). <a rel="nofollow" class="external text" href="https://ieeexplore.ieee.org/document/9294300">"Improving Generalization of Reinforcement Learning with Minimax Distributional Soft Actor-Critic"</a>. <i>2020 IEEE 23rd International Conference on Intelligent Transportation Systems (ITSC)</i>. pp. 1–6. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2002.05502">2002.05502</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FITSC45102.2020.9294300">10.1109/ITSC45102.2020.9294300</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-1-7281-4149-7" title="Special:BookSources/978-1-7281-4149-7"><bdi>978-1-7281-4149-7</bdi></a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:211096594">211096594</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Improving+Generalization+of+Reinforcement+Learning+with+Minimax+Distributional+Soft+Actor-Critic&rft.btitle=2020+IEEE+23rd+International+Conference+on+Intelligent+Transportation+Systems+%28ITSC%29&rft.pages=1-6&rft.date=2020&rft_id=info%3Aarxiv%2F2002.05502&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A211096594%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1109%2FITSC45102.2020.9294300&rft.isbn=978-1-7281-4149-7&rft.au=Y+Ren&rft.au=J+Duan&rft.au=S+Li&rft_id=https%3A%2F%2Fieeexplore.ieee.org%2Fdocument%2F9294300&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-45"><span class="mw-cite-backlink"><b><a href="#cite_ref-45">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDuanWangXiao2023" class="citation arxiv cs1">Duan, J; Wang, W; Xiao, L (2023-10-26). "DSAC-T: Distributional Soft Actor-Critic with Three Refinements". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2310.05858">2310.05858</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=DSAC-T%3A+Distributional+Soft+Actor-Critic+with+Three+Refinements&rft.date=2023-10-26&rft_id=info%3Aarxiv%2F2310.05858&rft.aulast=Duan&rft.aufirst=J&rft.au=Wang%2C+W&rft.au=Xiao%2C+L&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-46"><span class="mw-cite-backlink"><b><a href="#cite_ref-46">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSoucek1992" class="citation book cs1">Soucek, Branko (6 May 1992). <i>Dynamic, Genetic and Chaotic Programming: The Sixth-Generation Computer Technology Series</i>. John Wiley & Sons, Inc. p. 38. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/0-471-55717-X" title="Special:BookSources/0-471-55717-X"><bdi>0-471-55717-X</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Dynamic%2C+Genetic+and+Chaotic+Programming%3A+The+Sixth-Generation+Computer+Technology+Series&rft.pages=38&rft.pub=John+Wiley+%26+Sons%2C+Inc&rft.date=1992-05-06&rft.isbn=0-471-55717-X&rft.aulast=Soucek&rft.aufirst=Branko&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-intro_deep_RL-47"><span class="mw-cite-backlink"><b><a href="#cite_ref-intro_deep_RL_47-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFFrancois-Lavet2018" class="citation journal cs1">Francois-Lavet, Vincent; et al. (2018). "An Introduction to Deep Reinforcement Learning". 
<i>Foundations and Trends in Machine Learning</i>. <b>11</b> (3–4): 219–354. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1811.12560">1811.12560</a></span>. <a href="/wiki/Bibcode_(identifier)" class="mw-redirect" title="Bibcode (identifier)">Bibcode</a>:<a rel="nofollow" class="external text" href="https://ui.adsabs.harvard.edu/abs/2018arXiv181112560F">2018arXiv181112560F</a>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1561%2F2200000071">10.1561/2200000071</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:54434537">54434537</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Foundations+and+Trends+in+Machine+Learning&rft.atitle=An+Introduction+to+Deep+Reinforcement+Learning&rft.volume=11&rft.issue=3%E2%80%934&rft.pages=219-354&rft.date=2018&rft_id=info%3Aarxiv%2F1811.12560&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A54434537%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1561%2F2200000071&rft_id=info%3Abibcode%2F2018arXiv181112560F&rft.aulast=Francois-Lavet&rft.aufirst=Vincent&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-DQN2-48"><span class="mw-cite-backlink"><b><a href="#cite_ref-DQN2_48-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFMnih2015" class="citation journal cs1">Mnih, Volodymyr; et al. (2015). "Human-level control through deep reinforcement learning". <i>Nature</i>. <b>518</b> (7540): 529–533. <a href="/wiki/Bibcode_(identifier)" class="mw-redirect" title="Bibcode (identifier)">Bibcode</a>:<a rel="nofollow" class="external text" href="https://ui.adsabs.harvard.edu/abs/2015Natur.518..529M">2015Natur.518..529M</a>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1038%2Fnature14236">10.1038/nature14236</a>. <a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a> <a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/25719670">25719670</a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:205242740">205242740</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Nature&rft.atitle=Human-level+control+through+deep+reinforcement+learning&rft.volume=518&rft.issue=7540&rft.pages=529-533&rft.date=2015&rft_id=info%3Adoi%2F10.1038%2Fnature14236&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A205242740%23id-name%3DS2CID&rft_id=info%3Apmid%2F25719670&rft_id=info%3Abibcode%2F2015Natur.518..529M&rft.aulast=Mnih&rft.aufirst=Volodymyr&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-49"><span class="mw-cite-backlink"><b><a href="#cite_ref-49">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFGoodfellowShlensSzegedy2015" class="citation journal cs1">Goodfellow, Ian; Shlens, Jonathan; Szegedy, Christian (2015). "Explaining and Harnessing Adversarial Examples". <i>International Conference on Learning Representations</i>. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1412.6572">1412.6572</a></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=International+Conference+on+Learning+Representations&rft.atitle=Explaining+and+Harnessing+Adversarial+Examples&rft.date=2015&rft_id=info%3Aarxiv%2F1412.6572&rft.aulast=Goodfellow&rft.aufirst=Ian&rft.au=Shlens%2C+Jonathan&rft.au=Szegedy%2C+Christian&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-50"><span class="mw-cite-backlink"><b><a href="#cite_ref-50">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBehzadanMunir2017" class="citation book cs1">Behzadan, Vahid; Munir, Arslan (2017). "Vulnerability of Deep Reinforcement Learning to Policy Induction Attacks". <i>Machine Learning and Data Mining in Pattern Recognition</i>. Lecture Notes in Computer Science. Vol. 10358. pp. 262–275. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1701.04143">1701.04143</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1007%2F978-3-319-62416-7_19">10.1007/978-3-319-62416-7_19</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-3-319-62415-0" title="Special:BookSources/978-3-319-62415-0"><bdi>978-3-319-62415-0</bdi></a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:1562290">1562290</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Vulnerability+of+Deep+Reinforcement+Learning+to+Policy+Induction+Attacks&rft.btitle=Machine+Learning+and+Data+Mining+in+Pattern+Recognition&rft.series=Lecture+Notes+in+Computer+Science&rft.pages=262-275&rft.date=2017&rft_id=info%3Aarxiv%2F1701.04143&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A1562290%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1007%2F978-3-319-62416-7_19&rft.isbn=978-3-319-62415-0&rft.aulast=Behzadan&rft.aufirst=Vahid&rft.au=Munir%2C+Arslan&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-51"><span class="mw-cite-backlink"><b><a href="#cite_ref-51">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPieter2017" class="citation book cs1">Pieter, Huang, Sandy Papernot, Nicolas Goodfellow, Ian Duan, Yan Abbeel (2017-02-07). <a rel="nofollow" class="external text" href="http://worldcat.org/oclc/1106256905"><i>Adversarial Attacks on Neural Network Policies</i></a>. <a href="/wiki/OCLC_(identifier)" class="mw-redirect" title="OCLC (identifier)">OCLC</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/oclc/1106256905">1106256905</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Adversarial+Attacks+on+Neural+Network+Policies&rft.date=2017-02-07&rft_id=info%3Aoclcnum%2F1106256905&rft.aulast=Pieter&rft.aufirst=Huang%2C+Sandy+Papernot%2C+Nicolas+Goodfellow%2C+Ian+Duan%2C+Yan+Abbeel&rft_id=http%3A%2F%2Fworldcat.org%2Foclc%2F1106256905&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span><span class="cs1-maint citation-comment"><code class="cs1-code">{{<a href="/wiki/Template:Cite_book" title="Template:Cite book">cite book</a>}}</code>: CS1 maint: multiple names: authors list (<a href="/wiki/Category:CS1_maint:_multiple_names:_authors_list" title="Category:CS1 maint: multiple names: authors list">link</a>)</span></span> </li> <li id="cite_note-52"><span class="mw-cite-backlink"><b><a href="#cite_ref-52">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKorkmaz2022" class="citation journal cs1">Korkmaz, Ezgi (2022). <a rel="nofollow" class="external text" href="https://doi.org/10.1609%2Faaai.v36i7.20684">"Deep Reinforcement Learning Policies Learn Shared Adversarial Features Across MDPs"</a>. <i>Thirty-Sixth AAAI Conference on Artificial Intelligence (AAAI-22)</i>. <b>36</b> (7): 7229–7238. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2112.09025">2112.09025</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.1609%2Faaai.v36i7.20684">10.1609/aaai.v36i7.20684</a></span>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:245219157">245219157</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Thirty-Sixth+AAAI+Conference+on+Artificial+Intelligence+%28AAAI-22%29&rft.atitle=Deep+Reinforcement+Learning+Policies+Learn+Shared+Adversarial+Features+Across+MDPs.&rft.volume=36&rft.issue=7&rft.pages=7229-7238&rft.date=2022&rft_id=info%3Aarxiv%2F2112.09025&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A245219157%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1609%2Faaai.v36i7.20684&rft.aulast=Korkmaz&rft.aufirst=Ezgi&rft_id=https%3A%2F%2Fdoi.org%2F10.1609%252Faaai.v36i7.20684&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-53"><span class="mw-cite-backlink"><b><a href="#cite_ref-53">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBerenji1994" class="citation book cs1">Berenji, H.R. (1994). <a rel="nofollow" class="external text" href="https://ieeexplore.ieee.org/document/343737">"Fuzzy Q-learning: A new approach for fuzzy dynamic programming"</a>. <i>Proceedings of 1994 IEEE 3rd International Fuzzy Systems Conference</i>. Orlando, FL, USA: IEEE. pp. 486–491. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FFUZZY.1994.343737">10.1109/FUZZY.1994.343737</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/0-7803-1896-X" title="Special:BookSources/0-7803-1896-X"><bdi>0-7803-1896-X</bdi></a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:56694947">56694947</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Fuzzy+Q-learning%3A+A+new+approach+for+fuzzy+dynamic+programming&rft.btitle=Proceedings+of+1994+IEEE+3rd+International+Fuzzy+Systems+Conference&rft.place=Orlando%2C+FL%2C+USA&rft.pages=486-491&rft.pub=IEEE&rft.date=1994&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A56694947%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1109%2FFUZZY.1994.343737&rft.isbn=0-7803-1896-X&rft.aulast=Berenji&rft.aufirst=H.R.&rft_id=https%3A%2F%2Fieeexplore.ieee.org%2Fdocument%2F343737&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-54"><span class="mw-cite-backlink"><b><a href="#cite_ref-54">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFVincze2017" class="citation book cs1">Vincze, David (2017). <a rel="nofollow" class="external text" href="http://users.iit.uni-miskolc.hu/~vinczed/research/vinczed_sami2017_author_draft.pdf">"Fuzzy rule interpolation and reinforcement learning"</a> <span class="cs1-format">(PDF)</span>. <i>2017 IEEE 15th International Symposium on Applied Machine Intelligence and Informatics (SAMI)</i>. IEEE. pp. 173–178. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FSAMI.2017.7880298">10.1109/SAMI.2017.7880298</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-1-5090-5655-2" title="Special:BookSources/978-1-5090-5655-2"><bdi>978-1-5090-5655-2</bdi></a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:17590120">17590120</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Fuzzy+rule+interpolation+and+reinforcement+learning&rft.btitle=2017+IEEE+15th+International+Symposium+on+Applied+Machine+Intelligence+and+Informatics+%28SAMI%29&rft.pages=173-178&rft.pub=IEEE&rft.date=2017&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A17590120%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1109%2FSAMI.2017.7880298&rft.isbn=978-1-5090-5655-2&rft.aulast=Vincze&rft.aufirst=David&rft_id=http%3A%2F%2Fusers.iit.uni-miskolc.hu%2F~vinczed%2Fresearch%2Fvinczed_sami2017_author_draft.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-55"><span class="mw-cite-backlink"><b><a href="#cite_ref-55">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFNgRussell2000" class="citation book cs1">Ng, A. Y.; Russell, S. J. (2000). <a rel="nofollow" class="external text" href="https://ai.stanford.edu/~ang/papers/icml00-irl.pdf">"Algorithms for Inverse Reinforcement Learning"</a> <span class="cs1-format">(PDF)</span>. <i>Proceeding ICML '00 Proceedings of the Seventeenth International Conference on Machine Learning</i>. pp. 663–670. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/1-55860-707-2" title="Special:BookSources/1-55860-707-2"><bdi>1-55860-707-2</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Algorithms+for+Inverse+Reinforcement+Learning&rft.btitle=Proceeding+ICML+%2700+Proceedings+of+the+Seventeenth+International+Conference+on+Machine+Learning&rft.pages=663-670&rft.date=2000&rft.isbn=1-55860-707-2&rft.aulast=Ng&rft.aufirst=A.+Y.&rft.au=Russell%2C+S.+J.&rft_id=https%3A%2F%2Fai.stanford.edu%2F~ang%2Fpapers%2Ficml00-irl.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-56"><span class="mw-cite-backlink"><b><a href="#cite_ref-56">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFZiebartMaasBagnellDey2008" class="citation journal cs1">Ziebart, Brian D.; Maas, Andrew; Bagnell, J. Andrew; Dey, Anind K. (2008-07-13). <a rel="nofollow" class="external text" href="https://dl.acm.org/doi/10.5555/1620270.1620297">"Maximum entropy inverse reinforcement learning"</a>. <i>Proceedings of the 23rd National Conference on Artificial Intelligence - Volume 3</i>. AAAI'08. Chicago, Illinois: AAAI Press: 1433–1438. 
<a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-1-57735-368-3" title="Special:BookSources/978-1-57735-368-3"><bdi>978-1-57735-368-3</bdi></a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:336219">336219</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Proceedings+of+the+23rd+National+Conference+on+Artificial+Intelligence+-+Volume+3&rft.atitle=Maximum+entropy+inverse+reinforcement+learning&rft.pages=1433-1438&rft.date=2008-07-13&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A336219%23id-name%3DS2CID&rft.isbn=978-1-57735-368-3&rft.aulast=Ziebart&rft.aufirst=Brian+D.&rft.au=Maas%2C+Andrew&rft.au=Bagnell%2C+J.+Andrew&rft.au=Dey%2C+Anind+K.&rft_id=https%3A%2F%2Fdl.acm.org%2Fdoi%2F10.5555%2F1620270.1620297&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-57"><span class="mw-cite-backlink"><b><a href="#cite_ref-57">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPitombeira-NetoSantosCoelho_da_Silvade_Macedo2024" class="citation journal cs1">Pitombeira-Neto, Anselmo R.; Santos, Helano P.; Coelho da Silva, Ticiana L.; de Macedo, José Antonio F. (March 2024). <a rel="nofollow" class="external text" href="https://doi.org/10.1016/j.ins.2024.120128">"Trajectory modeling via random utility inverse reinforcement learning"</a>. <i>Information Sciences</i>. <b>660</b>: 120128. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2105.12092">2105.12092</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1016%2Fj.ins.2024.120128">10.1016/j.ins.2024.120128</a>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/0020-0255">0020-0255</a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:235187141">235187141</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Information+Sciences&rft.atitle=Trajectory+modeling+via+random+utility+inverse+reinforcement+learning&rft.volume=660&rft.pages=120128&rft.date=2024-03&rft_id=info%3Aarxiv%2F2105.12092&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A235187141%23id-name%3DS2CID&rft.issn=0020-0255&rft_id=info%3Adoi%2F10.1016%2Fj.ins.2024.120128&rft.aulast=Pitombeira-Neto&rft.aufirst=Anselmo+R.&rft.au=Santos%2C+Helano+P.&rft.au=Coelho+da+Silva%2C+Ticiana+L.&rft.au=de+Macedo%2C+Jos%C3%A9+Antonio+F.&rft_id=https%3A%2F%2Fdoi.org%2F10.1016%2Fj.ins.2024.120128&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-58"><span class="mw-cite-backlink"><b><a href="#cite_ref-58">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFGarcíaFernández2015" class="citation journal cs1">García, Javier; Fernández, Fernando (1 January 2015). <a rel="nofollow" class="external text" href="https://jmlr.org/papers/volume16/garcia15a/garcia15a.pdf">"A comprehensive survey on safe reinforcement learning"</a> <span class="cs1-format">(PDF)</span>. <i>The Journal of Machine Learning Research</i>. <b>16</b> (1): 1437–1480.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=The+Journal+of+Machine+Learning+Research&rft.atitle=A+comprehensive+survey+on+safe+reinforcement+learning&rft.volume=16&rft.issue=1&rft.pages=1437-1480&rft.date=2015-01-01&rft.aulast=Garc%C3%ADa&rft.aufirst=Javier&rft.au=Fern%C3%A1ndez%2C+Fernando&rft_id=https%3A%2F%2Fjmlr.org%2Fpapers%2Fvolume16%2Fgarcia15a%2Fgarcia15a.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-59"><span class="mw-cite-backlink"><b><a href="#cite_ref-59">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDabneyOstrovskiSilverMunos2018" class="citation journal cs1">Dabney, Will; Ostrovski, Georg; Silver, David; Munos, Remi (2018-07-03). <a rel="nofollow" class="external text" href="https://proceedings.mlr.press/v80/dabney18a.html">"Implicit Quantile Networks for Distributional Reinforcement Learning"</a>. <i>Proceedings of the 35th International Conference on Machine Learning</i>. PMLR: 1096–1105. 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1806.06923">1806.06923</a></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Proceedings+of+the+35th+International+Conference+on+Machine+Learning&rft.atitle=Implicit+Quantile+Networks+for+Distributional+Reinforcement+Learning&rft.pages=1096-1105&rft.date=2018-07-03&rft_id=info%3Aarxiv%2F1806.06923&rft.aulast=Dabney&rft.aufirst=Will&rft.au=Ostrovski%2C+Georg&rft.au=Silver%2C+David&rft.au=Munos%2C+Remi&rft_id=https%3A%2F%2Fproceedings.mlr.press%2Fv80%2Fdabney18a.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-60"><span class="mw-cite-backlink"><b><a href="#cite_ref-60">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFChowTamarMannorPavone2015" class="citation journal cs1">Chow, Yinlam; Tamar, Aviv; Mannor, Shie; Pavone, Marco (2015). <a rel="nofollow" class="external text" href="https://proceedings.neurips.cc/paper/2015/hash/64223ccf70bbb65a3a4aceac37e21016-Abstract.html">"Risk-Sensitive and Robust Decision-Making: a CVaR Optimization Approach"</a>. <i>Advances in Neural Information Processing Systems</i>. <b>28</b>. Curran Associates, Inc. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1506.02188">1506.02188</a></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Advances+in+Neural+Information+Processing+Systems&rft.atitle=Risk-Sensitive+and+Robust+Decision-Making%3A+a+CVaR+Optimization+Approach&rft.volume=28&rft.date=2015&rft_id=info%3Aarxiv%2F1506.02188&rft.aulast=Chow&rft.aufirst=Yinlam&rft.au=Tamar%2C+Aviv&rft.au=Mannor%2C+Shie&rft.au=Pavone%2C+Marco&rft_id=https%3A%2F%2Fproceedings.neurips.cc%2Fpaper%2F2015%2Fhash%2F64223ccf70bbb65a3a4aceac37e21016-Abstract.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-61"><span class="mw-cite-backlink"><b><a href="#cite_ref-61">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://scholar.google.com/citations?view_op=view_citation&hl=en&user=LnwyFkkAAAAJ&citation_for_view=LnwyFkkAAAAJ:eQOLeE2rZwMC">"Train Hard, Fight Easy: Robust Meta Reinforcement Learning"</a>. <i>scholar.google.com</i><span class="reference-accessdate">. 
Retrieved <span class="nowrap">2024-06-21</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=scholar.google.com&rft.atitle=Train+Hard%2C+Fight+Easy%3A+Robust+Meta+Reinforcement+Learning&rft_id=https%3A%2F%2Fscholar.google.com%2Fcitations%3Fview_op%3Dview_citation%26hl%3Den%26user%3DLnwyFkkAAAAJ%26citation_for_view%3DLnwyFkkAAAAJ%3AeQOLeE2rZwMC&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-62"><span class="mw-cite-backlink"><b><a href="#cite_ref-62">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFTamarGlassnerMannor2015" class="citation journal cs1">Tamar, Aviv; Glassner, Yonatan; Mannor, Shie (2015-02-21). <a rel="nofollow" class="external text" href="https://ojs.aaai.org/index.php/AAAI/article/view/9561">"Optimizing the CVaR via Sampling"</a>. <i>Proceedings of the AAAI Conference on Artificial Intelligence</i>. <b>29</b> (1). <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1404.3862">1404.3862</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1609%2Faaai.v29i1.9561">10.1609/aaai.v29i1.9561</a>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/2374-3468">2374-3468</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Proceedings+of+the+AAAI+Conference+on+Artificial+Intelligence&rft.atitle=Optimizing+the+CVaR+via+Sampling&rft.volume=29&rft.issue=1&rft.date=2015-02-21&rft_id=info%3Aarxiv%2F1404.3862&rft.issn=2374-3468&rft_id=info%3Adoi%2F10.1609%2Faaai.v29i1.9561&rft.aulast=Tamar&rft.aufirst=Aviv&rft.au=Glassner%2C+Yonatan&rft.au=Mannor%2C+Shie&rft_id=https%3A%2F%2Fojs.aaai.org%2Findex.php%2FAAAI%2Farticle%2Fview%2F9561&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-63"><span class="mw-cite-backlink"><b><a href="#cite_ref-63">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFGreenbergChowGhavamzadehMannor2022" class="citation journal cs1">Greenberg, Ido; Chow, Yinlam; Ghavamzadeh, Mohammad; Mannor, Shie (2022-12-06). <a rel="nofollow" class="external text" href="https://proceedings.neurips.cc/paper_files/paper/2022/hash/d2511dfb731fa336739782ba825cd98c-Abstract-Conference.html">"Efficient Risk-Averse Reinforcement Learning"</a>. <i>Advances in Neural Information Processing Systems</i>. <b>35</b>: 32639–32652. 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2205.05138">2205.05138</a></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Advances+in+Neural+Information+Processing+Systems&rft.atitle=Efficient+Risk-Averse+Reinforcement+Learning&rft.volume=35&rft.pages=32639-32652&rft.date=2022-12-06&rft_id=info%3Aarxiv%2F2205.05138&rft.aulast=Greenberg&rft.aufirst=Ido&rft.au=Chow%2C+Yinlam&rft.au=Ghavamzadeh%2C+Mohammad&rft.au=Mannor%2C+Shie&rft_id=https%3A%2F%2Fproceedings.neurips.cc%2Fpaper_files%2Fpaper%2F2022%2Fhash%2Fd2511dfb731fa336739782ba825cd98c-Abstract-Conference.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-64"><span class="mw-cite-backlink"><b><a href="#cite_ref-64">^</a></b></span> <span class="reference-text"> Bozinovski, S. (1982). "A self-learning system using secondary reinforcement". In Trappl, Robert (ed.). Cybernetics and Systems Research: Proceedings of the Sixth European Meeting on Cybernetics and Systems Research. North-Holland. pp. 397–402. ISBN 978-0-444-86488-8 </span> </li> <li id="cite_note-65"><span class="mw-cite-backlink"><b><a href="#cite_ref-65">^</a></b></span> <span class="reference-text"> Bozinovski S. (1995) "Neuro genetic agents and structural theory of self-reinforcement learning systems". CMPSCI Technical Report 95-107, University of Massachusetts at Amherst <a rel="nofollow" class="external autonumber" href="https://web.cs.umass.edu/publication/docs/1995/UM-CS-1995-107.pdf">[1]</a></span> </li> <li id="cite_note-66"><span class="mw-cite-backlink"><b><a href="#cite_ref-66">^</a></b></span> <span class="reference-text"> Bozinovski, S. (2014) "Modeling mechanisms of cognition-emotion interaction in artificial neural networks, since 1981." Procedia Computer Science p. 255-263 </span> </li> <li id="cite_note-67"><span class="mw-cite-backlink"><b><a href="#cite_ref-67">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFEngstromIlyasSanturkarTsipras2019" class="citation journal cs1">Engstrom, Logan; Ilyas, Andrew; Santurkar, Shibani; Tsipras, Dimitris; Janoos, Firdaus; Rudolph, Larry; Madry, Aleksander (2019-09-25). <a rel="nofollow" class="external text" href="https://openreview.net/forum?id=r1etN1rtPB">"Implementation Matters in Deep RL: A Case Study on PPO and TRPO"</a>. <i>ICLR</i>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=ICLR&rft.atitle=Implementation+Matters+in+Deep+RL%3A+A+Case+Study+on+PPO+and+TRPO&rft.date=2019-09-25&rft.aulast=Engstrom&rft.aufirst=Logan&rft.au=Ilyas%2C+Andrew&rft.au=Santurkar%2C+Shibani&rft.au=Tsipras%2C+Dimitris&rft.au=Janoos%2C+Firdaus&rft.au=Rudolph%2C+Larry&rft.au=Madry%2C+Aleksander&rft_id=https%3A%2F%2Fopenreview.net%2Fforum%3Fid%3Dr1etN1rtPB&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-68"><span class="mw-cite-backlink"><b><a href="#cite_ref-68">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFColas2019" class="citation journal cs1">Colas, Cédric (2019-03-06). 
<a rel="nofollow" class="external text" href="https://openreview.net/forum?id=ryx0N3IaIV">"A Hitchhiker's Guide to Statistical Comparisons of Reinforcement Learning Algorithms"</a>. <i>International Conference on Learning Representations</i>. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1904.06979">1904.06979</a></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=International+Conference+on+Learning+Representations&rft.atitle=A+Hitchhiker%27s+Guide+to+Statistical+Comparisons+of+Reinforcement+Learning+Algorithms&rft.date=2019-03-06&rft_id=info%3Aarxiv%2F1904.06979&rft.aulast=Colas&rft.aufirst=C%C3%A9dric&rft_id=https%3A%2F%2Fopenreview.net%2Fforum%3Fid%3Dryx0N3IaIV&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> <li id="cite_note-69"><span class="mw-cite-backlink"><b><a href="#cite_ref-69">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFGreenbergMannor2021" class="citation journal cs1">Greenberg, Ido; Mannor, Shie (2021-07-01). <a rel="nofollow" class="external text" href="https://proceedings.mlr.press/v139/greenberg21a.html">"Detecting Rewards Deterioration in Episodic Reinforcement Learning"</a>. <i>Proceedings of the 38th International Conference on Machine Learning</i>. PMLR: 3842–3853. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2010.11660">2010.11660</a></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Proceedings+of+the+38th+International+Conference+on+Machine+Learning&rft.atitle=Detecting+Rewards+Deterioration+in+Episodic+Reinforcement+Learning&rft.pages=3842-3853&rft.date=2021-07-01&rft_id=info%3Aarxiv%2F2010.11660&rft.aulast=Greenberg&rft.aufirst=Ido&rft.au=Mannor%2C+Shie&rft_id=https%3A%2F%2Fproceedings.mlr.press%2Fv139%2Fgreenberg21a.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></span> </li> </ol></div></div> <div class="mw-heading mw-heading2"><h2 id="Further_reading">Further reading</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Reinforcement_learning&action=edit&section=27" title="Edit section: Further reading"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <ul><li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFAnnaswamy2023" class="citation journal cs1">Annaswamy, Anuradha M. (3 May 2023). <a rel="nofollow" class="external text" href="https://doi.org/10.1146%2Fannurev-control-062922-090153">"Adaptive Control and Intersections with Reinforcement Learning"</a>. <i>Annual Review of Control, Robotics, and Autonomous Systems</i>. <b>6</b> (1): 65–93. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.1146%2Fannurev-control-062922-090153">10.1146/annurev-control-062922-090153</a></span>. 
<a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/2573-5144">2573-5144</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:255702873">255702873</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Annual+Review+of+Control%2C+Robotics%2C+and+Autonomous+Systems&rft.atitle=Adaptive+Control+and+Intersections+with+Reinforcement+Learning&rft.volume=6&rft.issue=1&rft.pages=65-93&rft.date=2023-05-03&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A255702873%23id-name%3DS2CID&rft.issn=2573-5144&rft_id=info%3Adoi%2F10.1146%2Fannurev-control-062922-090153&rft.aulast=Annaswamy&rft.aufirst=Anuradha+M.&rft_id=https%3A%2F%2Fdoi.org%2F10.1146%252Fannurev-control-062922-090153&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFAuerJakschOrtner2010" class="citation journal cs1"><a href="/wiki/Peter_Auer" title="Peter Auer">Auer, Peter</a>; Jaksch, Thomas; Ortner, Ronald (2010). <a rel="nofollow" class="external text" href="http://jmlr.csail.mit.edu/papers/v11/jaksch10a.html">"Near-optimal regret bounds for reinforcement learning"</a>. <i>Journal of Machine Learning Research</i>. <b>11</b>: 1563–1600.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Journal+of+Machine+Learning+Research&rft.atitle=Near-optimal+regret+bounds+for+reinforcement+learning&rft.volume=11&rft.pages=1563-1600&rft.date=2010&rft.aulast=Auer&rft.aufirst=Peter&rft.au=Jaksch%2C+Thomas&rft.au=Ortner%2C+Ronald&rft_id=http%3A%2F%2Fjmlr.csail.mit.edu%2Fpapers%2Fv11%2Fjaksch10a.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBertsekas2023" class="citation book cs1">Bertsekas, Dimitri P. (2023) [2019]. <a rel="nofollow" class="external text" href="http://www.mit.edu/~dimitrib/RLbook.html"><i>REINFORCEMENT LEARNING AND OPTIMAL CONTROL</i></a> (1st ed.). Athena Scientific. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-1-886-52939-7" title="Special:BookSources/978-1-886-52939-7"><bdi>978-1-886-52939-7</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=REINFORCEMENT+LEARNING+AND+OPTIMAL+CONTROL&rft.edition=1st&rft.pub=Athena+Scientific&rft.date=2023&rft.isbn=978-1-886-52939-7&rft.aulast=Bertsekas&rft.aufirst=Dimitri+P.&rft_id=http%3A%2F%2Fwww.mit.edu%2F~dimitrib%2FRLbook.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBusoniuBabuskaDe_SchutterErnst2010" class="citation book cs1">Busoniu, Lucian; Babuska, Robert; <a href="/wiki/Bart_De_Schutter" title="Bart De Schutter">De Schutter, Bart</a>; Ernst, Damien (2010). 
<a rel="nofollow" class="external text" href="http://www.dcsc.tudelft.nl/rlbook/"><i>Reinforcement Learning and Dynamic Programming using Function Approximators</i></a>. Taylor & Francis CRC Press. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-1-4398-2108-4" title="Special:BookSources/978-1-4398-2108-4"><bdi>978-1-4398-2108-4</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Reinforcement+Learning+and+Dynamic+Programming+using+Function+Approximators&rft.pub=Taylor+%26+Francis+CRC+Press&rft.date=2010&rft.isbn=978-1-4398-2108-4&rft.aulast=Busoniu&rft.aufirst=Lucian&rft.au=Babuska%2C+Robert&rft.au=De+Schutter%2C+Bart&rft.au=Ernst%2C+Damien&rft_id=http%3A%2F%2Fwww.dcsc.tudelft.nl%2Frlbook%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFFrançois-LavetHendersonIslamBellemare2018" class="citation journal cs1">François-Lavet, Vincent; Henderson, Peter; Islam, Riashat; Bellemare, Marc G.; Pineau, Joelle (2018). "An Introduction to Deep Reinforcement Learning". <i>Foundations and Trends in Machine Learning</i>. <b>11</b> (3–4): 219–354. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1811.12560">1811.12560</a></span>. <a href="/wiki/Bibcode_(identifier)" class="mw-redirect" title="Bibcode (identifier)">Bibcode</a>:<a rel="nofollow" class="external text" href="https://ui.adsabs.harvard.edu/abs/2018arXiv181112560F">2018arXiv181112560F</a>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1561%2F2200000071">10.1561/2200000071</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:54434537">54434537</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Foundations+and+Trends+in+Machine+Learning&rft.atitle=An+Introduction+to+Deep+Reinforcement+Learning&rft.volume=11&rft.issue=3%E2%80%934&rft.pages=219-354&rft.date=2018&rft_id=info%3Aarxiv%2F1811.12560&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A54434537%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1561%2F2200000071&rft_id=info%3Abibcode%2F2018arXiv181112560F&rft.aulast=Fran%C3%A7ois-Lavet&rft.aufirst=Vincent&rft.au=Henderson%2C+Peter&rft.au=Islam%2C+Riashat&rft.au=Bellemare%2C+Marc+G.&rft.au=Pineau%2C+Joelle&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLi2023" class="citation book cs1">Li, Shengbo Eben (2023). <a rel="nofollow" class="external text" href="https://link.springer.com/book/10.1007/978-981-19-7784-8"><i>Reinforcement Learning for Sequential Decision and Optimal Control</i></a> (1st ed.). Springer Verlag, Singapore. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1007%2F978-981-19-7784-8">10.1007/978-981-19-7784-8</a>. 
<a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-9-811-97783-1" title="Special:BookSources/978-9-811-97783-1"><bdi>978-9-811-97783-1</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Reinforcement+Learning+for+Sequential+Decision+and+Optimal+Control&rft.edition=1st&rft.pub=Springer+Verlag%2C+Singapore&rft.date=2023&rft_id=info%3Adoi%2F10.1007%2F978-981-19-7784-8&rft.isbn=978-9-811-97783-1&rft.aulast=Li&rft.aufirst=Shengbo+Eben&rft_id=https%3A%2F%2Flink.springer.com%2Fbook%2F10.1007%2F978-981-19-7784-8&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPowell2011" class="citation book cs1">Powell, Warren (2011). <a rel="nofollow" class="external text" href="https://web.archive.org/web/20160731230325/http://castlelab.princeton.edu/adp.htm"><i>Approximate dynamic programming: solving the curses of dimensionality</i></a>. Wiley-Interscience. Archived from <a rel="nofollow" class="external text" href="http://www.castlelab.princeton.edu/adp.htm">the original</a> on 2016-07-31<span class="reference-accessdate">. Retrieved <span class="nowrap">2010-09-08</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Approximate+dynamic+programming%3A+solving+the+curses+of+dimensionality&rft.pub=Wiley-Interscience&rft.date=2011&rft.aulast=Powell&rft.aufirst=Warren&rft_id=http%3A%2F%2Fwww.castlelab.princeton.edu%2Fadp.htm&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSutton1988" class="citation journal cs1"><a href="/wiki/Richard_S._Sutton" title="Richard S. Sutton">Sutton, Richard S.</a> (1988). <a rel="nofollow" class="external text" href="https://doi.org/10.1007%2FBF00115009">"Learning to predict by the method of temporal differences"</a>. <i>Machine Learning</i>. <b>3</b>: 9–44. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.1007%2FBF00115009">10.1007/BF00115009</a></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Machine+Learning&rft.atitle=Learning+to+predict+by+the+method+of+temporal+differences&rft.volume=3&rft.pages=9-44&rft.date=1988&rft_id=info%3Adoi%2F10.1007%2FBF00115009&rft.aulast=Sutton&rft.aufirst=Richard+S.&rft_id=https%3A%2F%2Fdoi.org%2F10.1007%252FBF00115009&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSuttonBarto2018" class="citation book cs1"><a href="/wiki/Richard_S._Sutton" title="Richard S. Sutton">Sutton, Richard S.</a>; <a href="/wiki/Andrew_Barto" title="Andrew Barto">Barto, Andrew G.</a> (2018) [1998]. <a rel="nofollow" class="external text" href="http://incompleteideas.net/sutton/book/the-book.html"><i>Reinforcement Learning: An Introduction</i></a> (2nd ed.). MIT Press. 
<a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-0-262-03924-6" title="Special:BookSources/978-0-262-03924-6"><bdi>978-0-262-03924-6</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Reinforcement+Learning%3A+An+Introduction&rft.edition=2nd&rft.pub=MIT+Press&rft.date=2018&rft.isbn=978-0-262-03924-6&rft.aulast=Sutton&rft.aufirst=Richard+S.&rft.au=Barto%2C+Andrew+G.&rft_id=http%3A%2F%2Fincompleteideas.net%2Fsutton%2Fbook%2Fthe-book.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSzitaSzepesvari2010" class="citation conference cs1">Szita, Istvan; Szepesvari, Csaba (2010). <a rel="nofollow" class="external text" href="https://web.archive.org/web/20100714095438/http://www.icml2010.org/papers/546.pdf">"Model-based Reinforcement Learning with Nearly Tight Exploration Complexity Bounds"</a> <span class="cs1-format">(PDF)</span>. <i>ICML 2010</i>. Omnipress. pp. 1031–1038. Archived from <a rel="nofollow" class="external text" href="http://www.icml2010.org/papers/546.pdf">the original</a> <span class="cs1-format">(PDF)</span> on 2010-07-14.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.atitle=Model-based+Reinforcement+Learning+with+Nearly+Tight+Exploration+Complexity+Bounds&rft.btitle=ICML+2010&rft.pages=1031-1038&rft.pub=Omnipress&rft.date=2010&rft.aulast=Szita&rft.aufirst=Istvan&rft.au=Szepesvari%2C+Csaba&rft_id=http%3A%2F%2Fwww.icml2010.org%2Fpapers%2F546.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AReinforcement+learning" class="Z3988"></span></li></ul> <div class="mw-heading mw-heading2"><h2 id="External_links">External links</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Reinforcement_learning&action=edit&section=28" title="Edit section: External links"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <ul><li><a rel="nofollow" class="external text" href="https://mpatacchiola.github.io/blog/2016/12/09/dissecting-reinforcement-learning.html">Dissecting Reinforcement Learning</a> Series of blog post on reinforcement learning with Python code</li> <li><a rel="nofollow" class="external text" href="https://lilianweng.github.io/posts/2018-02-19-rl-overview/">A (Long) Peek into Reinforcement Learning</a></li></ul> <div class="navbox-styles"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374"><style data-mw-deduplicate="TemplateStyles:r1236075235">.mw-parser-output .navbox{box-sizing:border-box;border:1px solid #a2a9b1;width:100%;clear:both;font-size:88%;text-align:center;padding:1px;margin:1em auto 0}.mw-parser-output .navbox .navbox{margin-top:0}.mw-parser-output .navbox+.navbox,.mw-parser-output .navbox+.navbox-styles+.navbox{margin-top:-1px}.mw-parser-output .navbox-inner,.mw-parser-output .navbox-subgroup{width:100%}.mw-parser-output .navbox-group,.mw-parser-output .navbox-title,.mw-parser-output .navbox-abovebelow{padding:0.25em 1em;line-height:1.5em;text-align:center}.mw-parser-output .navbox-group{white-space:nowrap;text-align:right}.mw-parser-output .navbox,.mw-parser-output .navbox-subgroup{background-color:#fdfdfd}.mw-parser-output .navbox-list{line-height:1.5em;border-color:#fdfdfd}.mw-parser-output 