Stochastic gradient descent

From Wikipedia, the free encyclopedia

Optimization algorithm

[o]" accesskey="o"><span class="vector-icon mw-ui-icon-logIn mw-ui-icon-wikimedia-logIn"></span> <span>Log in</span></a></li> </ul> </div> </div> <div id="p-user-menu-anon-editor" class="vector-menu mw-portlet mw-portlet-user-menu-anon-editor" > <div class="vector-menu-heading"> Pages for logged out editors <a href="/wiki/Help:Introduction" aria-label="Learn more about editing"><span>learn more</span></a> </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-anoncontribs" class="mw-list-item"><a href="/wiki/Special:MyContributions" title="A list of edits made from this IP address [y]" accesskey="y"><span>Contributions</span></a></li><li id="pt-anontalk" class="mw-list-item"><a href="/wiki/Special:MyTalk" title="Discussion about edits from this IP address [n]" accesskey="n"><span>Talk</span></a></li> </ul> </div> </div> </div> </div> </nav> </div> </header> </div> <div class="mw-page-container"> <div class="mw-page-container-inner"> <div class="vector-sitenotice-container"> <div id="siteNotice"><!-- CentralNotice --></div> </div> <div class="vector-column-start"> <div class="vector-main-menu-container"> <div id="mw-navigation"> <nav id="mw-panel" class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-pinned-container" class="vector-pinned-container"> </div> </nav> </div> </div> <div class="vector-sticky-pinned-container"> <nav id="mw-panel-toc" aria-label="Contents" data-event-name="ui.sidebar-toc" class="mw-table-of-contents-container vector-toc-landmark"> <div id="vector-toc-pinned-container" class="vector-pinned-container"> <div id="vector-toc" class="vector-toc vector-pinnable-element"> <div class="vector-pinnable-header vector-toc-pinnable-header vector-pinnable-header-pinned" data-feature-name="toc-pinned" data-pinnable-element-id="vector-toc" > <h2 class="vector-pinnable-header-label">Contents</h2> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-toc.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-toc.unpin">hide</button> </div> <ul class="vector-toc-contents" id="mw-panel-toc-list"> <li id="toc-mw-content-text" class="vector-toc-list-item vector-toc-level-1"> <a href="#" class="vector-toc-link"> <div class="vector-toc-text">(Top)</div> </a> </li> <li id="toc-Background" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Background"> <div class="vector-toc-text"> <span class="vector-toc-numb">1</span> <span>Background</span> </div> </a> <ul id="toc-Background-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Iterative_method" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Iterative_method"> <div class="vector-toc-text"> <span class="vector-toc-numb">2</span> <span>Iterative method</span> </div> </a> <ul id="toc-Iterative_method-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Linear_regression" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Linear_regression"> <div class="vector-toc-text"> <span class="vector-toc-numb">3</span> <span>Linear regression</span> </div> </a> <ul id="toc-Linear_regression-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-History" class="vector-toc-list-item vector-toc-level-1 
vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#History"> <div class="vector-toc-text"> <span class="vector-toc-numb">4</span> <span>History</span> </div> </a> <ul id="toc-History-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Notable_applications" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Notable_applications"> <div class="vector-toc-text"> <span class="vector-toc-numb">5</span> <span>Notable applications</span> </div> </a> <ul id="toc-Notable_applications-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Extensions_and_variants" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Extensions_and_variants"> <div class="vector-toc-text"> <span class="vector-toc-numb">6</span> <span>Extensions and variants</span> </div> </a> <button aria-controls="toc-Extensions_and_variants-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Extensions and variants subsection</span> </button> <ul id="toc-Extensions_and_variants-sublist" class="vector-toc-list"> <li id="toc-Implicit_updates_(ISGD)" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Implicit_updates_(ISGD)"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.1</span> <span>Implicit updates (ISGD)</span> </div> </a> <ul id="toc-Implicit_updates_(ISGD)-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Momentum" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Momentum"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.2</span> <span>Momentum</span> </div> </a> <ul id="toc-Momentum-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Averaging" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Averaging"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.3</span> <span>Averaging</span> </div> </a> <ul id="toc-Averaging-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-AdaGrad" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#AdaGrad"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.4</span> <span>AdaGrad</span> </div> </a> <ul id="toc-AdaGrad-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-RMSProp" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#RMSProp"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.5</span> <span>RMSProp</span> </div> </a> <ul id="toc-RMSProp-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Adam" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Adam"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.6</span> <span>Adam</span> </div> </a> <ul id="toc-Adam-sublist" class="vector-toc-list"> <li id="toc-Variants" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Variants"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.6.1</span> <span>Variants</span> </div> </a> <ul id="toc-Variants-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Sign-based_stochastic_gradient_descent" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Sign-based_stochastic_gradient_descent"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.7</span> 
<span>Sign-based stochastic gradient descent</span> </div> </a> <ul id="toc-Sign-based_stochastic_gradient_descent-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Backtracking_line_search" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Backtracking_line_search"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.8</span> <span>Backtracking line search</span> </div> </a> <ul id="toc-Backtracking_line_search-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Second-order_methods" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Second-order_methods"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.9</span> <span>Second-order methods</span> </div> </a> <ul id="toc-Second-order_methods-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Approximations_in_continuous_time" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Approximations_in_continuous_time"> <div class="vector-toc-text"> <span class="vector-toc-numb">7</span> <span>Approximations in continuous time</span> </div> </a> <ul id="toc-Approximations_in_continuous_time-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-See_also" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#See_also"> <div class="vector-toc-text"> <span class="vector-toc-numb">8</span> <span>See also</span> </div> </a> <ul id="toc-See_also-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Notes" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Notes"> <div class="vector-toc-text"> <span class="vector-toc-numb">9</span> <span>Notes</span> </div> </a> <ul id="toc-Notes-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-References" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#References"> <div class="vector-toc-text"> <span class="vector-toc-numb">10</span> <span><span dir="ltr">References</span></span> </div> </a> <ul id="toc-References-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Further_reading" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Further_reading"> <div class="vector-toc-text"> <span class="vector-toc-numb">11</span> <span>Further reading</span> </div> </a> <ul id="toc-Further_reading-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-External_links" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#External_links"> <div class="vector-toc-text"> <span class="vector-toc-numb">12</span> <span>External links</span> </div> </a> <ul id="toc-External_links-sublist" class="vector-toc-list"> </ul> </li> </ul> </div> </div> </nav> </div> </div> <div class="mw-content-container"> <main id="content" class="mw-body"> <header class="mw-body-header vector-page-titlebar"> <nav aria-label="Contents" class="vector-toc-landmark"> <div id="vector-page-titlebar-toc" class="vector-dropdown vector-page-titlebar-toc vector-button-flush-left" title="Table of Contents" > <input type="checkbox" id="vector-page-titlebar-toc-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-titlebar-toc" class="vector-dropdown-checkbox " aria-label="Toggle the table of contents" > <label 
id="vector-page-titlebar-toc-label" for="vector-page-titlebar-toc-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-listBullet mw-ui-icon-wikimedia-listBullet"></span> <span class="vector-dropdown-label-text">Toggle the table of contents</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-titlebar-toc-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <h1 id="firstHeading" class="firstHeading mw-first-heading"><span class="mw-page-title-main">Stochastic gradient descent</span></h1> <div id="p-lang-btn" class="vector-dropdown mw-portlet mw-portlet-lang" > <input type="checkbox" id="p-lang-btn-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-p-lang-btn" class="vector-dropdown-checkbox mw-interlanguage-selector" aria-label="Go to an article in another language. Available in 13 languages" > <label id="p-lang-btn-label" for="p-lang-btn-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--action-progressive mw-portlet-lang-heading-13" aria-hidden="true" ><span class="vector-icon mw-ui-icon-language-progressive mw-ui-icon-wikimedia-language-progressive"></span> <span class="vector-dropdown-label-text">13 languages</span> </label> <div class="vector-dropdown-content"> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li class="interlanguage-link interwiki-ca mw-list-item"><a href="https://ca.wikipedia.org/wiki/Algorisme_del_gradient_estoc%C3%A0stic" title="Algorisme del gradient estocàstic – Catalan" lang="ca" hreflang="ca" data-title="Algorisme del gradient estocàstic" data-language-autonym="Català" data-language-local-name="Catalan" class="interlanguage-link-target"><span>Català</span></a></li><li class="interlanguage-link interwiki-es mw-list-item"><a href="https://es.wikipedia.org/wiki/Descenso_de_gradiente_estoc%C3%A1stico" title="Descenso de gradiente estocástico – Spanish" lang="es" hreflang="es" data-title="Descenso de gradiente estocástico" data-language-autonym="Español" data-language-local-name="Spanish" class="interlanguage-link-target"><span>Español</span></a></li><li class="interlanguage-link interwiki-fa mw-list-item"><a href="https://fa.wikipedia.org/wiki/%DA%AF%D8%B1%D8%A7%D8%AF%DB%8C%D8%A7%D9%86_%DA%A9%D8%A7%D9%87%D8%B4%DB%8C_%D8%AA%D8%B5%D8%A7%D8%AF%D9%81%DB%8C" title="گرادیان کاهشی تصادفی – Persian" lang="fa" hreflang="fa" data-title="گرادیان کاهشی تصادفی" data-language-autonym="فارسی" data-language-local-name="Persian" class="interlanguage-link-target"><span>فارسی</span></a></li><li class="interlanguage-link interwiki-fr mw-list-item"><a href="https://fr.wikipedia.org/wiki/Algorithme_du_gradient_stochastique" title="Algorithme du gradient stochastique – French" lang="fr" hreflang="fr" data-title="Algorithme du gradient stochastique" data-language-autonym="Français" data-language-local-name="French" class="interlanguage-link-target"><span>Français</span></a></li><li class="interlanguage-link interwiki-ko mw-list-item"><a href="https://ko.wikipedia.org/wiki/%ED%99%95%EB%A5%A0%EC%A0%81_%EA%B2%BD%EC%82%AC_%ED%95%98%EA%B0%95%EB%B2%95" title="확률적 경사 하강법 – Korean" lang="ko" hreflang="ko" data-title="확률적 경사 하강법" data-language-autonym="한국어" data-language-local-name="Korean" class="interlanguage-link-target"><span>한국어</span></a></li><li 
class="interlanguage-link interwiki-id mw-list-item"><a href="https://id.wikipedia.org/wiki/Penurunan_gradien_stokastik" title="Penurunan gradien stokastik – Indonesian" lang="id" hreflang="id" data-title="Penurunan gradien stokastik" data-language-autonym="Bahasa Indonesia" data-language-local-name="Indonesian" class="interlanguage-link-target"><span>Bahasa Indonesia</span></a></li><li class="interlanguage-link interwiki-it mw-list-item"><a href="https://it.wikipedia.org/wiki/Discesa_stocastica_del_gradiente" title="Discesa stocastica del gradiente – Italian" lang="it" hreflang="it" data-title="Discesa stocastica del gradiente" data-language-autonym="Italiano" data-language-local-name="Italian" class="interlanguage-link-target"><span>Italiano</span></a></li><li class="interlanguage-link interwiki-ja mw-list-item"><a href="https://ja.wikipedia.org/wiki/%E7%A2%BA%E7%8E%87%E7%9A%84%E5%8B%BE%E9%85%8D%E9%99%8D%E4%B8%8B%E6%B3%95" title="確率的勾配降下法 – Japanese" lang="ja" hreflang="ja" data-title="確率的勾配降下法" data-language-autonym="日本語" data-language-local-name="Japanese" class="interlanguage-link-target"><span>日本語</span></a></li><li class="interlanguage-link interwiki-ru mw-list-item"><a href="https://ru.wikipedia.org/wiki/%D0%A1%D1%82%D0%BE%D1%85%D0%B0%D1%81%D1%82%D0%B8%D1%87%D0%B5%D1%81%D0%BA%D0%B8%D0%B9_%D0%B3%D1%80%D0%B0%D0%B4%D0%B8%D0%B5%D0%BD%D1%82%D0%BD%D1%8B%D0%B9_%D1%81%D0%BF%D1%83%D1%81%D0%BA" title="Стохастический градиентный спуск – Russian" lang="ru" hreflang="ru" data-title="Стохастический градиентный спуск" data-language-autonym="Русский" data-language-local-name="Russian" class="interlanguage-link-target"><span>Русский</span></a></li><li class="interlanguage-link interwiki-sq mw-list-item"><a href="https://sq.wikipedia.org/wiki/Zbritja_stokastike_e_gradientit" title="Zbritja stokastike e gradientit – Albanian" lang="sq" hreflang="sq" data-title="Zbritja stokastike e gradientit" data-language-autonym="Shqip" data-language-local-name="Albanian" class="interlanguage-link-target"><span>Shqip</span></a></li><li class="interlanguage-link interwiki-th mw-list-item"><a href="https://th.wikipedia.org/wiki/%E0%B8%81%E0%B8%B2%E0%B8%A3%E0%B9%80%E0%B8%84%E0%B8%A5%E0%B8%B7%E0%B9%88%E0%B8%AD%E0%B8%99%E0%B8%A5%E0%B8%87%E0%B8%95%E0%B8%B2%E0%B8%A1%E0%B8%84%E0%B8%A7%E0%B8%B2%E0%B8%A1%E0%B8%8A%E0%B8%B1%E0%B8%99%E0%B9%81%E0%B8%9A%E0%B8%9A%E0%B9%80%E0%B8%9F%E0%B9%89%E0%B8%99%E0%B8%AA%E0%B8%B8%E0%B9%88%E0%B8%A1" title="การเคลื่อนลงตามความชันแบบเฟ้นสุ่ม – Thai" lang="th" hreflang="th" data-title="การเคลื่อนลงตามความชันแบบเฟ้นสุ่ม" data-language-autonym="ไทย" data-language-local-name="Thai" class="interlanguage-link-target"><span>ไทย</span></a></li><li class="interlanguage-link interwiki-uk mw-list-item"><a href="https://uk.wikipedia.org/wiki/%D0%A1%D1%82%D0%BE%D1%85%D0%B0%D1%81%D1%82%D0%B8%D1%87%D0%BD%D0%B8%D0%B9_%D0%B3%D1%80%D0%B0%D0%B4%D1%96%D1%94%D0%BD%D1%82%D0%BD%D0%B8%D0%B9_%D1%81%D0%BF%D1%83%D1%81%D0%BA" title="Стохастичний градієнтний спуск – Ukrainian" lang="uk" hreflang="uk" data-title="Стохастичний градієнтний спуск" data-language-autonym="Українська" data-language-local-name="Ukrainian" class="interlanguage-link-target"><span>Українська</span></a></li><li class="interlanguage-link interwiki-zh-yue mw-list-item"><a href="https://zh-yue.wikipedia.org/wiki/%E9%9A%A8%E6%A9%9F%E6%A2%AF%E5%BA%A6%E4%B8%8B%E9%99%8D%E6%B3%95" title="隨機梯度下降法 – Cantonese" lang="yue" hreflang="yue" data-title="隨機梯度下降法" data-language-autonym="粵語" data-language-local-name="Cantonese" 
class="interlanguage-link-target"><span>粵語</span></a></li> </ul> <div class="after-portlet after-portlet-lang"><span class="wb-langlinks-edit wb-langlinks-link"><a href="https://www.wikidata.org/wiki/Special:EntityPage/Q7617819#sitelinks-wikipedia" title="Edit interlanguage links" class="wbc-editpage">Edit links</a></span></div> </div> </div> </div> </header> <div class="vector-page-toolbar"> <div class="vector-page-toolbar-container"> <div id="left-navigation"> <nav aria-label="Namespaces"> <div id="p-associated-pages" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-associated-pages" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-nstab-main" class="selected vector-tab-noicon mw-list-item"><a href="/wiki/Stochastic_gradient_descent" title="View the content page [c]" accesskey="c"><span>Article</span></a></li><li id="ca-talk" class="vector-tab-noicon mw-list-item"><a href="/wiki/Talk:Stochastic_gradient_descent" rel="discussion" title="Discuss improvements to the content page [t]" accesskey="t"><span>Talk</span></a></li> </ul> </div> </div> <div id="vector-variants-dropdown" class="vector-dropdown emptyPortlet" > <input type="checkbox" id="vector-variants-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-variants-dropdown" class="vector-dropdown-checkbox " aria-label="Change language variant" > <label id="vector-variants-dropdown-label" for="vector-variants-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" aria-hidden="true" ><span class="vector-dropdown-label-text">English</span> </label> <div class="vector-dropdown-content"> <div id="p-variants" class="vector-menu mw-portlet mw-portlet-variants emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> </div> </div> </nav> </div> <div id="right-navigation" class="vector-collapsible"> <nav aria-label="Views"> <div id="p-views" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-views" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-view" class="selected vector-tab-noicon mw-list-item"><a href="/wiki/Stochastic_gradient_descent"><span>Read</span></a></li><li id="ca-edit" class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Stochastic_gradient_descent&amp;action=edit" title="Edit this page [e]" accesskey="e"><span>Edit</span></a></li><li id="ca-history" class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Stochastic_gradient_descent&amp;action=history" title="Past revisions of this page [h]" accesskey="h"><span>View history</span></a></li> </ul> </div> </div> </nav> <nav class="vector-page-tools-landmark" aria-label="Page tools"> <div id="vector-page-tools-dropdown" class="vector-dropdown vector-page-tools-dropdown" > <input type="checkbox" id="vector-page-tools-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-tools-dropdown" class="vector-dropdown-checkbox " aria-label="Tools" > <label id="vector-page-tools-dropdown-label" for="vector-page-tools-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" aria-hidden="true" ><span class="vector-dropdown-label-text">Tools</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-tools-unpinned-container" class="vector-unpinned-container"> <div 
id="vector-page-tools" class="vector-page-tools vector-pinnable-element"> <div class="vector-pinnable-header vector-page-tools-pinnable-header vector-pinnable-header-unpinned" data-feature-name="page-tools-pinned" data-pinnable-element-id="vector-page-tools" data-pinned-container-id="vector-page-tools-pinned-container" data-unpinned-container-id="vector-page-tools-unpinned-container" > <div class="vector-pinnable-header-label">Tools</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-page-tools.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-page-tools.unpin">hide</button> </div> <div id="p-cactions" class="vector-menu mw-portlet mw-portlet-cactions emptyPortlet vector-has-collapsible-items" title="More options" > <div class="vector-menu-heading"> Actions </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-more-view" class="selected vector-more-collapsible-item mw-list-item"><a href="/wiki/Stochastic_gradient_descent"><span>Read</span></a></li><li id="ca-more-edit" class="vector-more-collapsible-item mw-list-item"><a href="/w/index.php?title=Stochastic_gradient_descent&amp;action=edit" title="Edit this page [e]" accesskey="e"><span>Edit</span></a></li><li id="ca-more-history" class="vector-more-collapsible-item mw-list-item"><a href="/w/index.php?title=Stochastic_gradient_descent&amp;action=history"><span>View history</span></a></li> </ul> </div> </div> <div id="p-tb" class="vector-menu mw-portlet mw-portlet-tb" > <div class="vector-menu-heading"> General </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="t-whatlinkshere" class="mw-list-item"><a href="/wiki/Special:WhatLinksHere/Stochastic_gradient_descent" title="List of all English Wikipedia pages containing links to this page [j]" accesskey="j"><span>What links here</span></a></li><li id="t-recentchangeslinked" class="mw-list-item"><a href="/wiki/Special:RecentChangesLinked/Stochastic_gradient_descent" rel="nofollow" title="Recent changes in pages linked from this page [k]" accesskey="k"><span>Related changes</span></a></li><li id="t-upload" class="mw-list-item"><a href="//en.wikipedia.org/wiki/Wikipedia:File_Upload_Wizard" title="Upload files [u]" accesskey="u"><span>Upload file</span></a></li><li id="t-specialpages" class="mw-list-item"><a href="/wiki/Special:SpecialPages" title="A list of all special pages [q]" accesskey="q"><span>Special pages</span></a></li><li id="t-permalink" class="mw-list-item"><a href="/w/index.php?title=Stochastic_gradient_descent&amp;oldid=1265558819" title="Permanent link to this revision of this page"><span>Permanent link</span></a></li><li id="t-info" class="mw-list-item"><a href="/w/index.php?title=Stochastic_gradient_descent&amp;action=info" title="More information about this page"><span>Page information</span></a></li><li id="t-cite" class="mw-list-item"><a href="/w/index.php?title=Special:CiteThisPage&amp;page=Stochastic_gradient_descent&amp;id=1265558819&amp;wpFormIdentifier=titleform" title="Information on how to cite this page"><span>Cite this page</span></a></li><li id="t-urlshortener" class="mw-list-item"><a href="/w/index.php?title=Special:UrlShortener&amp;url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FStochastic_gradient_descent"><span>Get shortened URL</span></a></li><li id="t-urlshortener-qrcode" class="mw-list-item"><a 
href="/w/index.php?title=Special:QrCode&amp;url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FStochastic_gradient_descent"><span>Download QR code</span></a></li> </ul> </div> </div> <div id="p-coll-print_export" class="vector-menu mw-portlet mw-portlet-coll-print_export" > <div class="vector-menu-heading"> Print/export </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="coll-download-as-rl" class="mw-list-item"><a href="/w/index.php?title=Special:DownloadAsPdf&amp;page=Stochastic_gradient_descent&amp;action=show-download-screen" title="Download this page as a PDF file"><span>Download as PDF</span></a></li><li id="t-print" class="mw-list-item"><a href="/w/index.php?title=Stochastic_gradient_descent&amp;printable=yes" title="Printable version of this page [p]" accesskey="p"><span>Printable version</span></a></li> </ul> </div> </div> <div id="p-wikibase-otherprojects" class="vector-menu mw-portlet mw-portlet-wikibase-otherprojects" > <div class="vector-menu-heading"> In other projects </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="t-wikibase" class="wb-otherproject-link wb-otherproject-wikibase-dataitem mw-list-item"><a href="https://www.wikidata.org/wiki/Special:EntityPage/Q7617819" title="Structured data on this page hosted by Wikidata [g]" accesskey="g"><span>Wikidata item</span></a></li> </ul> </div> </div> </div> </div> </div> </div> </nav> </div> </div> </div> <div class="vector-column-end"> <div class="vector-sticky-pinned-container"> <nav class="vector-page-tools-landmark" aria-label="Page tools"> <div id="vector-page-tools-pinned-container" class="vector-pinned-container"> </div> </nav> <nav class="vector-appearance-landmark" aria-label="Appearance"> <div id="vector-appearance-pinned-container" class="vector-pinned-container"> <div id="vector-appearance" class="vector-appearance vector-pinnable-element"> <div class="vector-pinnable-header vector-appearance-pinnable-header vector-pinnable-header-pinned" data-feature-name="appearance-pinned" data-pinnable-element-id="vector-appearance" data-pinned-container-id="vector-appearance-pinned-container" data-unpinned-container-id="vector-appearance-unpinned-container" > <div class="vector-pinnable-header-label">Appearance</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-appearance.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-appearance.unpin">hide</button> </div> </div> </div> </nav> </div> </div> <div id="bodyContent" class="vector-body" aria-labelledby="firstHeading" data-mw-ve-target-container> <div class="vector-body-before-content"> <div class="mw-indicators"> </div> <div id="siteSub" class="noprint">From Wikipedia, the free encyclopedia</div> </div> <div id="contentSub"><div id="mw-content-subtitle"></div></div> <div id="mw-content-text" class="mw-body-content"><div class="mw-content-ltr mw-parser-output" lang="en" dir="ltr"><div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">Optimization algorithm</div> <style data-mw-deduplicate="TemplateStyles:r1244144826">.mw-parser-output .machine-learning-list-title{background-color:#ddddff}html.skin-theme-clientpref-night .mw-parser-output .machine-learning-list-title{background-color:#222}@media(prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output 
.machine-learning-list-title{background-color:#222}}</style> <style data-mw-deduplicate="TemplateStyles:r1129693374">.mw-parser-output .hlist dl,.mw-parser-output .hlist ol,.mw-parser-output .hlist ul{margin:0;padding:0}.mw-parser-output .hlist dd,.mw-parser-output .hlist dt,.mw-parser-output .hlist li{margin:0;display:inline}.mw-parser-output .hlist.inline,.mw-parser-output .hlist.inline dl,.mw-parser-output .hlist.inline ol,.mw-parser-output .hlist.inline ul,.mw-parser-output .hlist dl dl,.mw-parser-output .hlist dl ol,.mw-parser-output .hlist dl ul,.mw-parser-output .hlist ol dl,.mw-parser-output .hlist ol ol,.mw-parser-output .hlist ol ul,.mw-parser-output .hlist ul dl,.mw-parser-output .hlist ul ol,.mw-parser-output .hlist ul ul{display:inline}.mw-parser-output .hlist .mw-empty-li{display:none}.mw-parser-output .hlist dt::after{content:": "}.mw-parser-output .hlist dd::after,.mw-parser-output .hlist li::after{content:" · ";font-weight:bold}.mw-parser-output .hlist dd:last-child::after,.mw-parser-output .hlist dt:last-child::after,.mw-parser-output .hlist li:last-child::after{content:none}.mw-parser-output .hlist dd dd:first-child::before,.mw-parser-output .hlist dd dt:first-child::before,.mw-parser-output .hlist dd li:first-child::before,.mw-parser-output .hlist dt dd:first-child::before,.mw-parser-output .hlist dt dt:first-child::before,.mw-parser-output .hlist dt li:first-child::before,.mw-parser-output .hlist li dd:first-child::before,.mw-parser-output .hlist li dt:first-child::before,.mw-parser-output .hlist li li:first-child::before{content:" (";font-weight:normal}.mw-parser-output .hlist dd dd:last-child::after,.mw-parser-output .hlist dd dt:last-child::after,.mw-parser-output .hlist dd li:last-child::after,.mw-parser-output .hlist dt dd:last-child::after,.mw-parser-output .hlist dt dt:last-child::after,.mw-parser-output .hlist dt li:last-child::after,.mw-parser-output .hlist li dd:last-child::after,.mw-parser-output .hlist li dt:last-child::after,.mw-parser-output .hlist li li:last-child::after{content:")";font-weight:normal}.mw-parser-output .hlist ol{counter-reset:listitem}.mw-parser-output .hlist ol>li{counter-increment:listitem}.mw-parser-output .hlist ol>li::before{content:" "counter(listitem)"\a0 "}.mw-parser-output .hlist dd ol>li:first-child::before,.mw-parser-output .hlist dt ol>li:first-child::before,.mw-parser-output .hlist li ol>li:first-child::before{content:" ("counter(listitem)"\a0 "}</style><style data-mw-deduplicate="TemplateStyles:r1246091330">.mw-parser-output .sidebar{width:22em;float:right;clear:right;margin:0.5em 0 1em 1em;background:var(--background-color-neutral-subtle,#f8f9fa);border:1px solid var(--border-color-base,#a2a9b1);padding:0.2em;text-align:center;line-height:1.4em;font-size:88%;border-collapse:collapse;display:table}body.skin-minerva .mw-parser-output .sidebar{display:table!important;float:right!important;margin:0.5em 0 1em 1em!important}.mw-parser-output .sidebar-subgroup{width:100%;margin:0;border-spacing:0}.mw-parser-output .sidebar-left{float:left;clear:left;margin:0.5em 1em 1em 0}.mw-parser-output .sidebar-none{float:none;clear:both;margin:0.5em 1em 1em 0}.mw-parser-output .sidebar-outer-title{padding:0 0.4em 0.2em;font-size:125%;line-height:1.2em;font-weight:bold}.mw-parser-output .sidebar-top-image{padding:0.4em}.mw-parser-output .sidebar-top-caption,.mw-parser-output .sidebar-pretitle-with-top-image,.mw-parser-output .sidebar-caption{padding:0.2em 0.4em 0;line-height:1.2em}.mw-parser-output .sidebar-pretitle{padding:0.4em 0.4em 
href="/wiki/Reservoir_computing" title="Reservoir computing">reservoir computing</a></li></ul></li> <li><a href="/wiki/Boltzmann_machine" title="Boltzmann machine">Boltzmann machine</a> <ul><li><a href="/wiki/Restricted_Boltzmann_machine" title="Restricted Boltzmann machine">Restricted</a></li></ul></li> <li><a href="/wiki/Generative_adversarial_network" title="Generative adversarial network">GAN</a></li> <li><a href="/wiki/Diffusion_model" title="Diffusion model">Diffusion model</a></li> <li><a href="/wiki/Self-organizing_map" title="Self-organizing map">SOM</a></li> <li><a href="/wiki/Convolutional_neural_network" title="Convolutional neural network">Convolutional neural network</a> <ul><li><a href="/wiki/U-Net" title="U-Net">U-Net</a></li> <li><a href="/wiki/LeNet" title="LeNet">LeNet</a></li> <li><a href="/wiki/AlexNet" title="AlexNet">AlexNet</a></li> <li><a href="/wiki/DeepDream" title="DeepDream">DeepDream</a></li></ul></li> <li><a href="/wiki/Neural_radiance_field" title="Neural radiance field">Neural radiance field</a></li> <li><a href="/wiki/Transformer_(machine_learning_model)" class="mw-redirect" title="Transformer (machine learning model)">Transformer</a> <ul><li><a href="/wiki/Vision_transformer" title="Vision transformer">Vision</a></li></ul></li> <li><a href="/wiki/Mamba_(deep_learning_architecture)" title="Mamba (deep learning architecture)">Mamba</a></li> <li><a href="/wiki/Spiking_neural_network" title="Spiking neural network">Spiking neural network</a></li> <li><a href="/wiki/Memtransistor" title="Memtransistor">Memtransistor</a></li> <li><a href="/wiki/Electrochemical_RAM" title="Electrochemical RAM">Electrochemical RAM</a> (ECRAM)</li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)"><a href="/wiki/Reinforcement_learning" title="Reinforcement learning">Reinforcement learning</a></div><div class="sidebar-list-content mw-collapsible-content hlist" style="background-color: #FFFFFF;"> <ul><li><a href="/wiki/Q-learning" title="Q-learning">Q-learning</a></li> <li><a href="/wiki/State%E2%80%93action%E2%80%93reward%E2%80%93state%E2%80%93action" title="State–action–reward–state–action">SARSA</a></li> <li><a href="/wiki/Temporal_difference_learning" title="Temporal difference learning">Temporal difference (TD)</a></li> <li><a href="/wiki/Multi-agent_reinforcement_learning" title="Multi-agent reinforcement learning">Multi-agent</a> <ul><li><a href="/wiki/Self-play_(reinforcement_learning_technique)" class="mw-redirect" title="Self-play (reinforcement learning technique)">Self-play</a></li></ul></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Learning with humans</div><div class="sidebar-list-content mw-collapsible-content hlist" style="background-color: #FFFFFF;"> <ul><li><a href="/wiki/Active_learning_(machine_learning)" title="Active learning (machine learning)">Active learning</a></li> <li><a href="/wiki/Crowdsourcing" title="Crowdsourcing">Crowdsourcing</a></li> <li><a href="/wiki/Human-in-the-loop" title="Human-in-the-loop">Human-in-the-loop</a></li> <li><a href="/wiki/Reinforcement_learning_from_human_feedback" title="Reinforcement learning from human 
feedback">RLHF</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Model diagnostics</div><div class="sidebar-list-content mw-collapsible-content hlist" style="background-color: #FFFFFF;"> <ul><li><a href="/wiki/Coefficient_of_determination" title="Coefficient of determination">Coefficient of determination</a></li> <li><a href="/wiki/Confusion_matrix" title="Confusion matrix">Confusion matrix</a></li> <li><a href="/wiki/Learning_curve_(machine_learning)" title="Learning curve (machine learning)">Learning curve</a></li> <li><a href="/wiki/Receiver_operating_characteristic" title="Receiver operating characteristic">ROC curve</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Mathematical foundations</div><div class="sidebar-list-content mw-collapsible-content hlist" style="background-color: #FFFFFF;"> <ul><li><a href="/wiki/Kernel_machines" class="mw-redirect" title="Kernel machines">Kernel machines</a></li> <li><a href="/wiki/Bias%E2%80%93variance_tradeoff" title="Bias–variance tradeoff">Bias–variance tradeoff</a></li> <li><a href="/wiki/Computational_learning_theory" title="Computational learning theory">Computational learning theory</a></li> <li><a href="/wiki/Empirical_risk_minimization" title="Empirical risk minimization">Empirical risk minimization</a></li> <li><a href="/wiki/Occam_learning" title="Occam learning">Occam learning</a></li> <li><a href="/wiki/Probably_approximately_correct_learning" title="Probably approximately correct learning">PAC learning</a></li> <li><a href="/wiki/Statistical_learning_theory" title="Statistical learning theory">Statistical learning</a></li> <li><a href="/wiki/Vapnik%E2%80%93Chervonenkis_theory" title="Vapnik–Chervonenkis theory">VC theory</a></li> <li><a href="/wiki/Topological_deep_learning" title="Topological deep learning">Topological deep learning</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Journals and conferences</div><div class="sidebar-list-content mw-collapsible-content hlist" style="background-color: #FFFFFF;"> <ul><li><a href="/wiki/ECML_PKDD" title="ECML PKDD">ECML PKDD</a></li> <li><a href="/wiki/Conference_on_Neural_Information_Processing_Systems" title="Conference on Neural Information Processing Systems">NeurIPS</a></li> <li><a href="/wiki/International_Conference_on_Machine_Learning" title="International Conference on Machine Learning">ICML</a></li> <li><a href="/wiki/International_Conference_on_Learning_Representations" title="International Conference on Learning Representations">ICLR</a></li> <li><a href="/wiki/International_Joint_Conference_on_Artificial_Intelligence" title="International Joint Conference on Artificial Intelligence">IJCAI</a></li> <li><a href="/wiki/Machine_Learning_(journal)" title="Machine Learning (journal)">ML</a></li> <li><a href="/wiki/Journal_of_Machine_Learning_Research" title="Journal of Machine Learning Research">JMLR</a></li></ul></div></div></td> </tr><tr><td 
class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Related articles</div><div class="sidebar-list-content mw-collapsible-content hlist" style="background-color: #FFFFFF;"> <ul><li><a href="/wiki/Glossary_of_artificial_intelligence" title="Glossary of artificial intelligence">Glossary of artificial intelligence</a></li> <li><a href="/wiki/List_of_datasets_for_machine-learning_research" title="List of datasets for machine-learning research">List of datasets for machine-learning research</a> <ul><li><a href="/wiki/List_of_datasets_in_computer_vision_and_image_processing" title="List of datasets in computer vision and image processing">List of datasets in computer vision and image processing</a></li></ul></li> <li><a href="/wiki/Outline_of_machine_learning" title="Outline of machine learning">Outline of machine learning</a></li></ul></div></div></td> </tr><tr><td class="sidebar-navbar"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374"><style data-mw-deduplicate="TemplateStyles:r1239400231">.mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{float:left;text-align:left}.mw-parser-output .navbar-boxtext{word-spacing:0}.mw-parser-output .navbar ul{display:inline-block;white-space:nowrap;line-height:inherit}.mw-parser-output .navbar-brackets::before{margin-right:-0.125em;content:"[ "}.mw-parser-output .navbar-brackets::after{margin-left:-0.125em;content:" ]"}.mw-parser-output .navbar li{word-spacing:-0.125em}.mw-parser-output .navbar a>span,.mw-parser-output .navbar a>abbr{text-decoration:inherit}.mw-parser-output .navbar-mini abbr{font-variant:small-caps;border-bottom:none;text-decoration:none;cursor:inherit}.mw-parser-output .navbar-ct-full{font-size:114%;margin:0 7em}.mw-parser-output .navbar-ct-mini{font-size:114%;margin:0 4em}html.skin-theme-clientpref-night .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}@media(prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}}@media print{.mw-parser-output .navbar{display:none!important}}</style><div class="navbar plainlinks hlist navbar-mini"><ul><li class="nv-view"><a href="/wiki/Template:Machine_learning" title="Template:Machine learning"><abbr title="View this template">v</abbr></a></li><li class="nv-talk"><a href="/wiki/Template_talk:Machine_learning" title="Template talk:Machine learning"><abbr title="Discuss this template">t</abbr></a></li><li class="nv-edit"><a href="/wiki/Special:EditPage/Template:Machine_learning" title="Special:EditPage/Template:Machine learning"><abbr title="Edit this template">e</abbr></a></li></ul></div></td></tr></tbody></table> <p><b>Stochastic gradient descent</b> (often abbreviated <b>SGD</b>) is an <a href="/wiki/Iterative_method" title="Iterative method">iterative</a> method for optimizing an <a href="/wiki/Objective_function" class="mw-redirect" title="Objective function">objective function</a> with suitable <a href="/wiki/Smoothness" title="Smoothness">smoothness</a> properties (e.g. <a href="/wiki/Differentiable_function" title="Differentiable function">differentiable</a> or <a href="/wiki/Subderivative" title="Subderivative">subdifferentiable</a>). 
It can be regarded as a stochastic approximation of gradient descent optimization, since it replaces the actual gradient (calculated from the entire data set) by an estimate thereof (calculated from a randomly selected subset of the data). Especially in high-dimensional optimization problems this reduces the very high computational burden, achieving faster iterations in exchange for a lower convergence rate.[1]

The basic idea behind stochastic approximation can be traced back to the Robbins–Monro algorithm of the 1950s. Today, stochastic gradient descent has become an important optimization method in machine learning.[2]

== Background ==

Main article: M-estimation
See also: Estimating equation

Both statistical estimation and machine learning consider the problem of minimizing an objective function that has the form of a sum:

<math display="block">Q(w) = \frac{1}{n} \sum_{i=1}^{n} Q_i(w),</math>

where the parameter w that minimizes Q(w) is to be estimated.
Each summand function Q_i is typically associated with the i-th observation in the data set (used for training).

In classical statistics, sum-minimization problems arise in least squares and in maximum-likelihood estimation (for independent observations). The general class of estimators that arise as minimizers of sums are called M-estimators. However, in statistics, it has long been recognized that requiring even local minimization is too restrictive for some problems of maximum-likelihood estimation.[3] Therefore, contemporary statistical theorists often consider stationary points of the likelihood function (or zeros of its derivative, the score function, and other estimating equations).

The sum-minimization problem also arises for empirical risk minimization. There, Q_i(w) is the value of the loss function at the i-th example, and Q(w) is the empirical risk.
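In code form, the empirical risk is simply an average of per-example losses. A minimal sketch, assuming a squared-error loss (the loss choice and all names here are illustrative, not prescribed by the article):

<syntaxhighlight lang="python">
import numpy as np

def Q_i(w, x_i, y_i):
    """Per-example loss Q_i(w); squared error is one common choice."""
    return (np.dot(w, x_i) - y_i) ** 2

def Q(w, X, y):
    """Empirical risk Q(w) = (1/n) * sum_i Q_i(w)."""
    return np.mean([Q_i(w, x_i, y_i) for x_i, y_i in zip(X, y)])
</syntaxhighlight>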
When used to minimize the above function, a standard (or "batch") gradient descent method would perform the following iterations:

<math display="block">w := w - \eta \, \nabla Q(w) = w - \frac{\eta}{n} \sum_{i=1}^{n} \nabla Q_i(w).</math>

The step size is denoted by η (sometimes called the learning rate in machine learning) and here ":=" denotes the update of a variable in the algorithm.
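To make the contrast with the stochastic variant below concrete, here is a minimal NumPy sketch of the batch update (grad_Qi, a function returning ∇Q_i(w), and the parameter defaults are illustrative assumptions):

<syntaxhighlight lang="python">
import numpy as np

def batch_gradient_descent(grad_Qi, w0, n, eta=0.01, steps=100):
    """Batch gradient descent: every step averages the gradients of
    all n summand functions before updating w."""
    w = np.asarray(w0, dtype=float)
    for _ in range(steps):
        g = sum(grad_Qi(i, w) for i in range(n)) / n  # full gradient
        w = w - eta * g
    return w
</syntaxhighlight>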
In many cases, the summand functions have a simple form that enables inexpensive evaluations of the sum-function and the sum gradient. For example, in statistics, one-parameter exponential families allow economical function-evaluations and gradient-evaluations.

However, in other cases, evaluating the sum-gradient may require expensive evaluations of the gradients from all summand functions. When the training set is enormous and no simple formulas exist, evaluating the sums of gradients becomes very expensive, because evaluating the gradient requires evaluating all the summand functions' gradients. To economize on the computational cost at every iteration, stochastic gradient descent samples a subset of summand functions at every step. This is very effective in the case of large-scale machine learning problems.[4]

== Iterative method ==

[Figure: Fluctuations in the total objective function as gradient steps with respect to mini-batches are taken.]

In stochastic (or "on-line") gradient descent, the true gradient of Q(w) is approximated by the gradient at a single sample:

<math display="block">w := w - \eta \, \nabla Q_i(w).</math>

As the algorithm sweeps through the training set, it performs the above update for each training sample. Several passes can be made over the training set until the algorithm converges. If this is done, the data can be shuffled for each pass to prevent cycles. Typical implementations may use an adaptive learning rate so that the algorithm converges.[5]

In pseudocode, stochastic gradient descent can be presented as follows (a minimal Python rendering is given after the list):

* Choose an initial vector of parameters w and learning rate η.
* Repeat until an approximate minimum is obtained:
** Randomly shuffle samples in the training set.
** For i = 1, 2, ..., n, do:
*** w := w − η ∇Q_i(w).
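A sketch of this pseudocode in Python (grad_Qi and the defaults are illustrative assumptions, as above):

<syntaxhighlight lang="python">
import numpy as np

def sgd(grad_Qi, w0, n, eta=0.01, epochs=10, seed=0):
    """Stochastic gradient descent: update w using one randomly
    chosen training example at a time."""
    rng = np.random.default_rng(seed)
    w = np.asarray(w0, dtype=float)
    for _ in range(epochs):
        for i in rng.permutation(n):  # reshuffle each pass to prevent cycles
            w = w - eta * grad_Qi(i, w)
    return w
</syntaxhighlight>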
class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle i=1,2,...,n}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>i</mi> <mo>=</mo> <mn>1</mn> <mo>,</mo> <mn>2</mn> <mo>,</mo> <mo>.</mo> <mo>.</mo> <mo>.</mo> <mo>,</mo> <mi>n</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle i=1,2,...,n}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/daff11f8a99a009698fc116a303769863a68cd35" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:13.824ex; height:2.509ex;" alt="{\displaystyle i=1,2,...,n}"></span>, do: <ul><li><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle w:=w-\eta \,\nabla Q_{i}(w).}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>w</mi> <mo>:=</mo> <mi>w</mi> <mo>&#x2212;<!-- − --></mo> <mi>&#x03B7;<!-- η --></mi> <mspace width="thinmathspace" /> <mi mathvariant="normal">&#x2207;<!-- ∇ --></mi> <msub> <mi>Q</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </msub> <mo stretchy="false">(</mo> <mi>w</mi> <mo stretchy="false">)</mo> <mo>.</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle w:=w-\eta \,\nabla Q_{i}(w).}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/70916511ef8cbf2a59686723caf35c007ec20894" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:20.165ex; height:2.843ex;" alt="{\displaystyle w:=w-\eta \,\nabla Q_{i}(w).}"></span></li></ul></li></ul></li></ul> </div></div> </div> <p>A compromise between computing the true gradient and the gradient at a single sample is to compute the gradient against more than one training sample (called a "mini-batch") at each step. This can perform significantly better than "true" stochastic gradient descent described, because the code can make use of <a href="/wiki/Vectorization_(mathematics)" title="Vectorization (mathematics)">vectorization</a> libraries rather than computing each step separately as was first shown in <sup id="cite_ref-6" class="reference"><a href="#cite_note-6"><span class="cite-bracket">&#91;</span>6<span class="cite-bracket">&#93;</span></a></sup> where it was called "the bunch-mode back-propagation algorithm". It may also result in smoother convergence, as the gradient computed at each step is averaged over more training samples. </p><p>The convergence of stochastic gradient descent has been analyzed using the theories of <a href="/wiki/Convex_optimization" title="Convex optimization">convex minimization</a> and of <a href="/wiki/Stochastic_approximation" title="Stochastic approximation">stochastic approximation</a>. 
The convergence of stochastic gradient descent has been analyzed using the theories of convex minimization and of stochastic approximation. Briefly, when the learning rates η decrease with an appropriate rate, and subject to relatively mild assumptions, stochastic gradient descent converges almost surely to a global minimum when the objective function is convex or pseudoconvex, and otherwise converges almost surely to a local minimum.[2][7] This is in fact a consequence of the Robbins–Siegmund theorem.[8]
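The requirement that the learning rates "decrease with an appropriate rate" is classically formalized by the Robbins–Monro step-size conditions, stated here for orientation (the precise assumptions vary by result):

<math display="block">\sum_{t=1}^{\infty} \eta_t = \infty \qquad \text{and} \qquad \sum_{t=1}^{\infty} \eta_t^{2} < \infty,</math>

which are satisfied, for example, by the schedule η_t = η_0 / t.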
class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle ((x_{1},y_{1}),(x_{2},y_{2})\ldots ,(x_{n},y_{n}))}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mo stretchy="false">(</mo> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo>,</mo> <msub> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo>,</mo> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo>,</mo> <msub> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo>&#x2026;<!-- … --></mo> <mo>,</mo> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>n</mi> </mrow> </msub> <mo>,</mo> <msub> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>n</mi> </mrow> </msub> <mo stretchy="false">)</mo> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle ((x_{1},y_{1}),(x_{2},y_{2})\ldots ,(x_{n},y_{n}))}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/59a10f5235f347054d9a4dddbab23b6c6828cf82" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:29.965ex; height:2.843ex;" alt="{\displaystyle ((x_{1},y_{1}),(x_{2},y_{2})\ldots ,(x_{n},y_{n}))}"></span> and corresponding estimated responses <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle ({\hat {y}}_{1},{\hat {y}}_{2},\ldots ,{\hat {y}}_{n})}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mo stretchy="false">(</mo> <msub> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mover> <mi>y</mi> <mo stretchy="false">&#x005E;<!-- ^ --></mo> </mover> </mrow> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo>,</mo> <msub> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mover> <mi>y</mi> <mo stretchy="false">&#x005E;<!-- ^ --></mo> </mover> </mrow> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo>,</mo> <mo>&#x2026;<!-- … --></mo> <mo>,</mo> <msub> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mover> <mi>y</mi> <mo stretchy="false">&#x005E;<!-- ^ --></mo> </mover> </mrow> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>n</mi> </mrow> </msub> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle ({\hat {y}}_{1},{\hat {y}}_{2},\ldots ,{\hat {y}}_{n})}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/b55207c74907230fdb4067f3202fe08c735cbadb" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:15.255ex; height:2.843ex;" alt="{\displaystyle ({\hat {y}}_{1},{\hat {y}}_{2},\ldots ,{\hat {y}}_{n})}"></span> using <a href="/wiki/Least_squares" title="Least squares">least squares</a>. 
The objective function to be minimized is

<math display="block">Q(w) = \sum_{i=1}^{n} Q_i(w) = \sum_{i=1}^{n} \left(\hat{y}_i - y_i\right)^2 = \sum_{i=1}^{n} \left(w_1 + w_2 x_i - y_i\right)^2.</math>

The last line in the above pseudocode for this specific problem will become:

<math display="block">\begin{bmatrix} w_1 \\ w_2 \end{bmatrix} \leftarrow \begin{bmatrix} w_1 \\ w_2 \end{bmatrix} - \eta \begin{bmatrix} \frac{\partial}{\partial w_1} (w_1 + w_2 x_i - y_i)^2 \\ \frac{\partial}{\partial w_2} (w_1 + w_2 x_i - y_i)^2 \end{bmatrix} = \begin{bmatrix} w_1 \\ w_2 \end{bmatrix} - \eta \begin{bmatrix} 2 (w_1 + w_2 x_i - y_i) \\ 2 x_i (w_1 + w_2 x_i - y_i) \end{bmatrix}.</math>

Note that in each iteration or update step, the gradient is only evaluated at a single x_i. This is the key difference between stochastic gradient descent and batch gradient descent.
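A minimal sketch of this update rule in code (assuming NumPy arrays x and y of equal length; the step size and epoch count are arbitrary illustrative values):

<syntaxhighlight lang="python">
import numpy as np

def fit_line_sgd(x, y, eta=0.01, epochs=100, seed=0):
    """Fit y-hat = w1 + w2*x by SGD with the single-sample update above."""
    rng = np.random.default_rng(seed)
    w1, w2 = 0.0, 0.0
    for _ in range(epochs):
        for i in rng.permutation(len(x)):
            r = w1 + w2 * x[i] - y[i]  # residual of example i
            w1 -= eta * 2 * r          # d/dw1 of (w1 + w2*x_i - y_i)^2
            w2 -= eta * 2 * x[i] * r   # d/dw2 of (w1 + w2*x_i - y_i)^2
    return w1, w2
</syntaxhighlight>

For instance, on data generated from y = 1 + 2x, the returned pair should approach (1, 2) given a suitably small step size and enough passes.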
In general, given a linear regression problem $\hat{y} = \sum_{k \in 1:m} w_k x_k$, stochastic gradient descent behaves differently when $m < n$ (underparameterized) and $m \geq n$ (overparameterized). In the overparameterized case, stochastic gradient descent converges to $\arg\min_{w : w^T x_k = y_k \,\forall k \in 1:n} \|w - w_0\|$. That is, SGD converges to the interpolation solution with minimum distance from the starting point $w_0$. This is true even when the learning rate remains constant. In the underparameterized case, SGD does not converge if the learning rate remains constant.[9]
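The minimum-norm behavior in the overparameterized case can be checked numerically. The sketch below (data, step size, and iteration count are illustrative assumptions) starts SGD at $w_0 = 0$ and compares the result with the minimum-norm interpolating solution given by the pseudoinverse:

```python
import numpy as np

rng = np.random.default_rng(0)
n, m = 3, 5                        # n samples, m parameters: overparameterized (m >= n)
X = rng.normal(size=(n, m))
y = rng.normal(size=n)

w = np.zeros(m)                    # start at w0 = 0
eta = 0.01                         # constant learning rate
for t in range(200_000):
    i = t % n                      # cycle through the samples
    w -= eta * 2 * (X[i] @ w - y[i]) * X[i]

w_min_norm = np.linalg.pinv(X) @ y     # minimum-norm interpolating solution
print(np.allclose(w, w_min_norm))      # expected: True (w0 = 0, so min distance = min norm)
```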
Monro">Sutton Monro</a> introduced the earliest stochastic approximation methods, preceding stochastic gradient descent.<sup id="cite_ref-rm_10-0" class="reference"><a href="#cite_note-rm-10"><span class="cite-bracket">&#91;</span>10<span class="cite-bracket">&#93;</span></a></sup> Building on this work one year later, <a href="/wiki/Jack_Kiefer_(statistician)" title="Jack Kiefer (statistician)">Jack Kiefer</a> and <a href="/wiki/Jacob_Wolfowitz" title="Jacob Wolfowitz">Jacob Wolfowitz</a> published <a href="/wiki/Stochastic_approximation#Kiefer–Wolfowitz_algorithm" title="Stochastic approximation">an optimization algorithm</a> very close to stochastic gradient descent, using <a href="/wiki/Finite_difference#Basic_types" title="Finite difference">central differences</a> as an approximation of the gradient.<sup id="cite_ref-11" class="reference"><a href="#cite_note-11"><span class="cite-bracket">&#91;</span>11<span class="cite-bracket">&#93;</span></a></sup> Later in the 1950s, <a href="/wiki/Frank_Rosenblatt" title="Frank Rosenblatt">Frank Rosenblatt</a> used SGD to optimize his <a href="/wiki/Perceptron" title="Perceptron">perceptron model</a>, demonstrating the first applicability of stochastic gradient descent to neural networks.<sup id="cite_ref-12" class="reference"><a href="#cite_note-12"><span class="cite-bracket">&#91;</span>12<span class="cite-bracket">&#93;</span></a></sup> </p><p><a href="/wiki/Backpropagation" title="Backpropagation">Backpropagation</a> was first described in 1986, with stochastic gradient descent being used to efficiently optimize parameters across neural networks with multiple <a href="/wiki/Artificial_neural_network" class="mw-redirect" title="Artificial neural network">hidden layers</a>. Soon after, another improvement was developed: mini-batch gradient descent, where small batches of data are substituted for single samples. In 1997, the practical performance benefits from vectorization achievable with such small batches were first explored,<sup id="cite_ref-13" class="reference"><a href="#cite_note-13"><span class="cite-bracket">&#91;</span>13<span class="cite-bracket">&#93;</span></a></sup> paving the way for efficient optimization in machine learning. As of 2023, this mini-batch approach remains the norm for training neural networks, balancing the benefits of stochastic gradient descent with <a href="/wiki/Gradient_descent" title="Gradient descent">gradient descent</a>.<sup id="cite_ref-14" class="reference"><a href="#cite_note-14"><span class="cite-bracket">&#91;</span>14<span class="cite-bracket">&#93;</span></a></sup> </p><p>By the 1980s, <a href="/wiki/Momentum_(machine_learning)" class="mw-redirect" title="Momentum (machine learning)">momentum</a> had already been introduced, and was added to SGD optimization techniques in 1986.<sup id="cite_ref-15" class="reference"><a href="#cite_note-15"><span class="cite-bracket">&#91;</span>15<span class="cite-bracket">&#93;</span></a></sup> However, these optimization techniques assumed constant <a href="/wiki/Hyperparameter_(machine_learning)" title="Hyperparameter (machine learning)">hyperparameters</a>, i.e. a fixed learning rate and momentum parameter. 
In the 2010s, adaptive approaches to applying SGD with a per-parameter learning rate were introduced: AdaGrad (for "Adaptive Gradient") in 2011[16] and RMSprop (for "Root Mean Square Propagation") in 2012.[17] In 2014, Adam (for "Adaptive Moment Estimation") was published, applying the adaptive approach of RMSprop to momentum; many improvements and branches of Adam were then developed, such as Adadelta, AdamW, and Adamax.[18][19]

Within machine learning, approaches to optimization in 2023 are dominated by Adam-derived optimizers. TensorFlow and PyTorch, by far the most popular machine learning libraries,[20] as of 2023 largely include only Adam-derived optimizers, as well as predecessors to Adam such as RMSprop and classic SGD. PyTorch also partially supports Limited-memory BFGS, a line-search method, but only for single-device setups without parameter groups.[19][21]

Notable applications

Stochastic gradient descent is a popular algorithm for training a wide range of models in machine learning, including (linear) support vector machines, logistic regression (see, e.g., Vowpal Wabbit) and graphical models.[22] When combined with the backpropagation algorithm, it is the de facto standard algorithm for training artificial neural networks.[23]
href="#cite_note-23"><span class="cite-bracket">&#91;</span>23<span class="cite-bracket">&#93;</span></a></sup> Its use has been also reported in the <a href="/wiki/Geophysics" title="Geophysics">Geophysics</a> community, specifically to applications of Full Waveform Inversion (FWI).<sup id="cite_ref-24" class="reference"><a href="#cite_note-24"><span class="cite-bracket">&#91;</span>24<span class="cite-bracket">&#93;</span></a></sup> </p><p>Stochastic gradient descent competes with the <a href="/wiki/Limited-memory_BFGS" title="Limited-memory BFGS">L-BFGS</a> algorithm,<sup class="noprint Inline-Template Template-Fact" style="white-space:nowrap;">&#91;<i><a href="/wiki/Wikipedia:Citation_needed" title="Wikipedia:Citation needed"><span title="This claim needs references to reliable sources. (July 2015)">citation needed</span></a></i>&#93;</sup> which is also widely used. Stochastic gradient descent has been used since at least 1960 for training <a href="/wiki/Linear_regression" title="Linear regression">linear regression</a> models, originally under the name <a href="/wiki/ADALINE" title="ADALINE">ADALINE</a>.<sup id="cite_ref-25" class="reference"><a href="#cite_note-25"><span class="cite-bracket">&#91;</span>25<span class="cite-bracket">&#93;</span></a></sup> </p><p>Another stochastic gradient descent algorithm is the <a href="/wiki/Least_mean_squares_filter" title="Least mean squares filter">least mean squares (LMS)</a> adaptive filter. </p> <div class="mw-heading mw-heading2"><h2 id="Extensions_and_variants">Extensions and variants</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Stochastic_gradient_descent&amp;action=edit&amp;section=6" title="Edit section: Extensions and variants"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Many improvements on the basic stochastic gradient descent algorithm have been proposed and used. In particular, in machine learning, the need to set a <a href="/wiki/Learning_rate" title="Learning rate">learning rate</a> (step size) has been recognized as problematic. Setting this parameter too high can cause the algorithm to diverge; setting it too low makes it slow to converge.<sup id="cite_ref-26" class="reference"><a href="#cite_note-26"><span class="cite-bracket">&#91;</span>26<span class="cite-bracket">&#93;</span></a></sup> A conceptually simple extension of stochastic gradient descent makes the learning rate a decreasing function <span class="texhtml mvar" style="font-style:italic;">η<sub>t</sub></span> of the iteration number <span class="texhtml mvar" style="font-style:italic;">t</span>, giving a <i>learning rate schedule</i>, so that the first iterations cause large changes in the parameters, while the later ones do only fine-tuning. 
[Figure: A graph visualizing the behavior of a selected set of optimizers, using a 3D perspective projection of a loss function f(x, y)]

[Figure: A graph visualizing the behavior of a selected set of optimizers]

Implicit updates (ISGD)

As mentioned earlier, classical stochastic gradient descent is generally sensitive to the learning rate $\eta$. Fast convergence requires large learning rates, but this may induce numerical instability.
The problem can be largely solved[29] by considering implicit updates, whereby the stochastic gradient is evaluated at the next iterate rather than the current one:

$$w^{\text{new}} := w^{\text{old}} - \eta\, \nabla Q_i(w^{\text{new}}).$$

This equation is implicit since $w^{\text{new}}$ appears on both sides of the equation.
It is a stochastic form of the proximal gradient method since the update can also be written as:

$$w^{\text{new}} := \arg\min_w \left\{ Q_i(w) + \frac{1}{2\eta} \left\| w - w^{\text{old}} \right\|^2 \right\}.$$

As an example, consider least squares with features $x_1, \ldots, x_n \in \mathbb{R}^p$ and observations $y_1, \ldots, y_n \in \mathbb{R}$. We wish to solve:

$$\min_w \sum_{j=1}^n \left( y_j - x_j' w \right)^2,$$

where $x_j' w = x_{j,1} w_1 + x_{j,2} w_2 + \ldots + x_{j,p} w_p$ indicates the inner product. Note that $x$ could have "1" as its first element to include an intercept. Classical stochastic gradient descent proceeds as follows:

$$w^{\text{new}} = w^{\text{old}} + \eta \left( y_i - x_i' w^{\text{old}} \right) x_i,$$

where $i$ is uniformly sampled between 1 and $n$. Although theoretical convergence of this procedure happens under relatively mild assumptions, in practice the procedure can be quite unstable. In particular, when $\eta$ is misspecified so that $I - \eta x_i x_i'$ has large absolute eigenvalues with high probability, the procedure may diverge numerically within a few iterations.
In contrast, implicit stochastic gradient descent (shortened as ISGD) can be solved in closed form as:

$$w^{\text{new}} = w^{\text{old}} + \frac{\eta}{1 + \eta \left\| x_i \right\|^2} \left( y_i - x_i' w^{\text{old}} \right) x_i.$$

This procedure remains numerically stable for virtually all $\eta$, as the learning rate is now normalized. This comparison between classical and implicit stochastic gradient descent in the least squares problem is very similar to the comparison between the least mean squares (LMS) and normalized least mean squares (NLMS) filters.
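The difference in stability can be seen in a small numerical sketch; the data scale and the deliberately misspecified step size are illustrative assumptions. With the same $\eta$, the classical update diverges while the implicit update stays bounded:

```python
import numpy as np

rng = np.random.default_rng(1)
n, p = 50, 5
X = rng.normal(size=(n, p)) * 10.0   # large-norm features: eta * ||x_i||^2 >> 2
w_true = rng.normal(size=p)
y = X @ w_true                        # consistent (noiseless) system

eta = 0.05                            # misspecified for the classical update
w_sgd = np.zeros(p)
w_isgd = np.zeros(p)
for _ in range(50):
    i = rng.integers(n)
    x = X[i]
    w_sgd += eta * (y[i] - x @ w_sgd) * x                           # classical update
    w_isgd += eta / (1 + eta * (x @ x)) * (y[i] - x @ w_isgd) * x   # implicit update

print("classical SGD:", np.linalg.norm(w_sgd))    # blows up (numerical divergence)
print("implicit SGD:", np.linalg.norm(w_isgd))    # stays bounded, near ||w_true||
```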
Even though a closed-form solution for ISGD is only possible in least squares, the procedure can be efficiently implemented in a wide range of models. Specifically, suppose that $Q_i(w)$ depends on $w$ only through a linear combination with the features $x_i$, so that we can write $\nabla_w Q_i(w) = -q(x_i' w)\, x_i$, where $q() \in \mathbb{R}$ may depend on $x_i, y_i$ as well, but not on $w$ except through $x_i' w$. Least squares obeys this rule, and so do logistic regression and most generalized linear models. For instance, in least squares, $q(x_i' w) = y_i - x_i' w$; in logistic regression, $q(x_i' w) = y_i - S(x_i' w)$, where $S(u) = e^u/(1 + e^u)$ is the logistic function; and in Poisson regression, $q(x_i' w) = y_i - e^{x_i' w}$, and so on.

In such settings, ISGD is simply implemented as follows.
Let <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle f(\xi )=\eta q(x_{i}'w^{\text{old}}+\xi \|x_{i}\|^{2})}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>f</mi> <mo stretchy="false">(</mo> <mi>&#x03BE;<!-- ξ --></mi> <mo stretchy="false">)</mo> <mo>=</mo> <mi>&#x03B7;<!-- η --></mi> <mi>q</mi> <mo stretchy="false">(</mo> <msubsup> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> <mo>&#x2032;</mo> </msubsup> <msup> <mi>w</mi> <mrow class="MJX-TeXAtom-ORD"> <mtext>old</mtext> </mrow> </msup> <mo>+</mo> <mi>&#x03BE;<!-- ξ --></mi> <mo fence="false" stretchy="false">&#x2016;<!-- ‖ --></mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </msub> <msup> <mo fence="false" stretchy="false">&#x2016;<!-- ‖ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle f(\xi )=\eta q(x_{i}'w^{\text{old}}+\xi \|x_{i}\|^{2})}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/9ff7039597962c00ffb835401deb9bc92027c48e" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.005ex; width:26.863ex; height:3.343ex;" alt="{\displaystyle f(\xi )=\eta q(x_{i}&#039;w^{\text{old}}+\xi \|x_{i}\|^{2})}"></span>, where <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \xi }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>&#x03BE;<!-- ξ --></mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \xi }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/e0b461aaf61091abd5d2c808931c48b8ff9647db" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:1.03ex; height:2.509ex;" alt="{\displaystyle \xi }"></span> is scalar. 
Then, ISGD is equivalent to: <span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle w^{\text{new}}=w^{\text{old}}+\xi ^{\ast }x_{i},~{\text{where}}~\xi ^{\ast }=f(\xi ^{\ast }).}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msup> <mi>w</mi> <mrow class="MJX-TeXAtom-ORD"> <mtext>new</mtext> </mrow> </msup> <mo>=</mo> <msup> <mi>w</mi> <mrow class="MJX-TeXAtom-ORD"> <mtext>old</mtext> </mrow> </msup> <mo>+</mo> <msup> <mi>&#x03BE;<!-- ξ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mo>&#x2217;<!-- ∗ --></mo> </mrow> </msup> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </msub> <mo>,</mo> <mtext>&#xA0;</mtext> <mrow class="MJX-TeXAtom-ORD"> <mtext>where</mtext> </mrow> <mtext>&#xA0;</mtext> <msup> <mi>&#x03BE;<!-- ξ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mo>&#x2217;<!-- ∗ --></mo> </mrow> </msup> <mo>=</mo> <mi>f</mi> <mo stretchy="false">(</mo> <msup> <mi>&#x03BE;<!-- ξ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mo>&#x2217;<!-- ∗ --></mo> </mrow> </msup> <mo stretchy="false">)</mo> <mo>.</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle w^{\text{new}}=w^{\text{old}}+\xi ^{\ast }x_{i},~{\text{where}}~\xi ^{\ast }=f(\xi ^{\ast }).}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/5070f3dcc857cf3a957637f863c2edc3829d212f" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:38.124ex; height:3.176ex;" alt="{\displaystyle w^{\text{new}}=w^{\text{old}}+\xi ^{\ast }x_{i},~{\text{where}}~\xi ^{\ast }=f(\xi ^{\ast }).}"></span> </p><p>The scaling factor <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \xi ^{\ast }\in \mathbb {R} }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msup> <mi>&#x03BE;<!-- ξ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mo>&#x2217;<!-- ∗ --></mo> </mrow> </msup> <mo>&#x2208;<!-- ∈ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="double-struck">R</mi> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \xi ^{\ast }\in \mathbb {R} }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/fc2f9ce598e6b32c18ecac6bd64ba037209855da" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:6.607ex; height:2.676ex;" alt="{\displaystyle \xi ^{\ast }\in \mathbb {R} }"></span> can be found through the <a href="/wiki/Bisection_method" title="Bisection method">bisection method</a> since in most regular models, such as the aforementioned generalized linear models, function <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle q()}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>q</mi> <mo stretchy="false">(</mo> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle q()}</annotation> </semantics> </math></span><img 
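A minimal sketch of this fixed-point solve for logistic regression's $q$, using plain bisection (the function name, tolerance, and iteration cap are illustrative assumptions):

```python
import numpy as np

def isgd_step(w_old, x_i, y_i, eta, tol=1e-10, max_iter=200):
    """One implicit SGD step for logistic regression: w_new = w_old + xi* x_i,
    where xi* solves xi = f(xi) = eta * q(x_i'w_old + xi * ||x_i||^2)."""
    norm2 = x_i @ x_i

    def f(xi):
        u = x_i @ w_old + xi * norm2
        return eta * (y_i - 1.0 / (1.0 + np.exp(-u)))  # eta * q(u), q = y - S(u)

    # q decreasing => xi* lies between 0 and f(0)
    lo, hi = min(0.0, f(0.0)), max(0.0, f(0.0))
    for _ in range(max_iter):
        mid = 0.5 * (lo + hi)
        if mid - f(mid) > 0:      # g(xi) = xi - f(xi) is increasing, root where g = 0
            hi = mid
        else:
            lo = mid
        if hi - lo < tol:
            break
    return w_old + 0.5 * (lo + hi) * x_i
```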
src="https://wikimedia.org/api/rest_v1/media/math/render/svg/eaa4e64f94c9b3a7b9c178c756dff3c17228e1d8" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:2.879ex; height:2.843ex;" alt="{\displaystyle q()}"></span> is decreasing, and thus the search bounds for <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \xi ^{\ast }}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msup> <mi>&#x03BE;<!-- ξ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mo>&#x2217;<!-- ∗ --></mo> </mrow> </msup> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \xi ^{\ast }}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/74ce2447e43e305cf9923127dc902a8f51db7fad" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:2.088ex; height:2.676ex;" alt="{\displaystyle \xi ^{\ast }}"></span> are <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle [\min(0,f(0)),\max(0,f(0))]}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mo stretchy="false">[</mo> <mo movablelimits="true" form="prefix">min</mo> <mo stretchy="false">(</mo> <mn>0</mn> <mo>,</mo> <mi>f</mi> <mo stretchy="false">(</mo> <mn>0</mn> <mo stretchy="false">)</mo> <mo stretchy="false">)</mo> <mo>,</mo> <mo movablelimits="true" form="prefix">max</mo> <mo stretchy="false">(</mo> <mn>0</mn> <mo>,</mo> <mi>f</mi> <mo stretchy="false">(</mo> <mn>0</mn> <mo stretchy="false">)</mo> <mo stretchy="false">)</mo> <mo stretchy="false">]</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle [\min(0,f(0)),\max(0,f(0))]}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/103d0e4214316fcef57cd3bbba35919dc2708884" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:27.041ex; height:2.843ex;" alt="{\displaystyle [\min(0,f(0)),\max(0,f(0))]}"></span>. </p> <div class="mw-heading mw-heading3"><h3 id="Momentum">Momentum</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Stochastic_gradient_descent&amp;action=edit&amp;section=8" title="Edit section: Momentum"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p><span class="anchor" id="Momentum"></span><span class="anchor" id="Nesterov"></span> </p><p>Further proposals include the <i>momentum method</i> or the <i>heavy ball method</i>, which in ML context appeared in <a href="/wiki/David_Rumelhart" title="David Rumelhart">Rumelhart</a>, <a href="/wiki/Geoffrey_Hinton" title="Geoffrey Hinton">Hinton</a> and <a href="/wiki/Ronald_J._Williams" title="Ronald J. 
Williams">Williams</a>' paper on backpropagation learning<sup id="cite_ref-Rumelhart1986_30-0" class="reference"><a href="#cite_note-Rumelhart1986-30"><span class="cite-bracket">&#91;</span>30<span class="cite-bracket">&#93;</span></a></sup> and borrowed the idea from Soviet mathematician Boris Polyak's 1964 article on solving functional equations.<sup id="cite_ref-31" class="reference"><a href="#cite_note-31"><span class="cite-bracket">&#91;</span>31<span class="cite-bracket">&#93;</span></a></sup> Stochastic gradient descent with momentum remembers the update <span class="texhtml">Δ<i>w</i></span> at each iteration, and determines the next update as a <a href="/wiki/Linear_combination" title="Linear combination">linear combination</a> of the gradient and the previous update:<sup id="cite_ref-Sutskever2013_32-0" class="reference"><a href="#cite_note-Sutskever2013-32"><span class="cite-bracket">&#91;</span>32<span class="cite-bracket">&#93;</span></a></sup><sup id="cite_ref-SutskeverPhD_33-0" class="reference"><a href="#cite_note-SutskeverPhD-33"><span class="cite-bracket">&#91;</span>33<span class="cite-bracket">&#93;</span></a></sup> <span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \Delta w:=\alpha \Delta w-\eta \,\nabla Q_{i}(w)}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi mathvariant="normal">&#x0394;<!-- Δ --></mi> <mi>w</mi> <mo>:=</mo> <mi>&#x03B1;<!-- α --></mi> <mi mathvariant="normal">&#x0394;<!-- Δ --></mi> <mi>w</mi> <mo>&#x2212;<!-- − --></mo> <mi>&#x03B7;<!-- η --></mi> <mspace width="thinmathspace" /> <mi mathvariant="normal">&#x2207;<!-- ∇ --></mi> <msub> <mi>Q</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </msub> <mo stretchy="false">(</mo> <mi>w</mi> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \Delta w:=\alpha \Delta w-\eta \,\nabla Q_{i}(w)}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f758199c43fb11261e2dfd575de807b401b6496e" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:24.877ex; height:2.843ex;" alt="{\displaystyle \Delta w:=\alpha \Delta w-\eta \,\nabla Q_{i}(w)}"></span> <span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle w:=w+\Delta w}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>w</mi> <mo>:=</mo> <mi>w</mi> <mo>+</mo> <mi mathvariant="normal">&#x0394;<!-- Δ --></mi> <mi>w</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle w:=w+\Delta w}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/9305a474135c05fa8767aebd66c404e189506d03" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.505ex; width:13.514ex; height:2.343ex;" alt="{\displaystyle w:=w+\Delta w}"></span> that leads to: <span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle w:=w-\eta \,\nabla Q_{i}(w)+\alpha \Delta w}"> <semantics> 
<mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>w</mi> <mo>:=</mo> <mi>w</mi> <mo>&#x2212;<!-- − --></mo> <mi>&#x03B7;<!-- η --></mi> <mspace width="thinmathspace" /> <mi mathvariant="normal">&#x2207;<!-- ∇ --></mi> <msub> <mi>Q</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </msub> <mo stretchy="false">(</mo> <mi>w</mi> <mo stretchy="false">)</mo> <mo>+</mo> <mi>&#x03B1;<!-- α --></mi> <mi mathvariant="normal">&#x0394;<!-- Δ --></mi> <mi>w</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle w:=w-\eta \,\nabla Q_{i}(w)+\alpha \Delta w}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/34d0a4a8529cc81e00a33c9c4a79db57ab76dd5d" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:27.446ex; height:2.843ex;" alt="{\displaystyle w:=w-\eta \,\nabla Q_{i}(w)+\alpha \Delta w}"></span> </p><p>where the <a href="/wiki/Parametric_statistics" title="Parametric statistics">parameter</a> <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle w}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>w</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle w}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/88b1e0c8e1be5ebe69d18a8010676fa42d7961e6" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.664ex; height:1.676ex;" alt="{\displaystyle w}"></span> which minimizes <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle Q(w)}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>Q</mi> <mo stretchy="false">(</mo> <mi>w</mi> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle Q(w)}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/6346b36d378401a300ca8fea88d2a37d48973d0a" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:5.312ex; height:2.843ex;" alt="{\displaystyle Q(w)}"></span> is to be <a href="/wiki/Estimator" title="Estimator">estimated</a>, <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \eta }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>&#x03B7;<!-- η --></mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \eta }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/e4d701857cf5fbec133eebaf94deadf722537f64" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:1.169ex; height:2.176ex;" alt="{\displaystyle \eta }"></span> is a step size (sometimes called the <i><a href="/wiki/Learning_rate" title="Learning rate">learning rate</a></i> in machine learning) and <span class="mwe-math-element"><span class="mwe-math-mathml-inline 
mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \alpha }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>&#x03B1;<!-- α --></mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \alpha }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/b79333175c8b3f0840bfb4ec41b8072c83ea88d3" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.488ex; height:1.676ex;" alt="{\displaystyle \alpha }"></span> is an exponential <a href="/wiki/Learning_rate#Learning_rate_schedule" title="Learning rate">decay factor</a> between 0 and 1 that determines the relative contribution of the current gradient and earlier gradients to the weight change. </p><p>The name momentum stems from an analogy to <a href="/wiki/Momentum" title="Momentum">momentum</a> in physics: the weight vector <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle w}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>w</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle w}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/88b1e0c8e1be5ebe69d18a8010676fa42d7961e6" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.664ex; height:1.676ex;" alt="{\displaystyle w}"></span>, thought of as a particle traveling through parameter space,<sup id="cite_ref-Rumelhart1986_30-1" class="reference"><a href="#cite_note-Rumelhart1986-30"><span class="cite-bracket">&#91;</span>30<span class="cite-bracket">&#93;</span></a></sup> incurs acceleration from the gradient of the loss ("<a href="/wiki/Force" title="Force">force</a>"). Unlike in classical stochastic gradient descent, it tends to keep traveling in the same direction, preventing oscillations. 
Momentum has been used successfully by computer scientists in the training of artificial neural networks for several decades.[34] The *momentum method* is closely related to underdamped Langevin dynamics, and may be combined with simulated annealing.[35]

In the mid-1980s the method was modified by Yurii Nesterov to use the gradient predicted at the next point, and the resulting so-called *Nesterov Accelerated Gradient* was sometimes used in ML in the 2010s.[36]
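A sketch of the look-ahead variant, assuming the formulation common in machine learning in which the gradient is evaluated at the predicted next point $w + \alpha \Delta w$ (precise formulations vary across references):

```python
import numpy as np

def sgd_nesterov(grad, w0, eta=0.01, alpha=0.9, n_steps=1000):
    """Nesterov-style momentum: gradient taken at the look-ahead point."""
    w = np.array(w0, dtype=float)
    delta_w = np.zeros_like(w)
    for _ in range(n_steps):
        g = grad(w + alpha * delta_w)        # gradient predicted at the next point
        delta_w = alpha * delta_w - eta * g
        w = w + delta_w
    return w
```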
### Averaging

*Averaged stochastic gradient descent*, invented independently by Ruppert and Polyak in the late 1980s, is ordinary stochastic gradient descent that records an average of its parameter vector over time. That is, the update is the same as for ordinary stochastic gradient descent, but the algorithm also keeps track of[37]

$$\bar{w} = \frac{1}{t} \sum_{i=0}^{t-1} w_i.$$

When optimization is done, this averaged parameter vector takes the place of $w$.
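The average can be maintained incrementally, so the only overhead is one extra parameter-sized vector. A minimal sketch, again assuming `grad(w)` returns a stochastic gradient:

```python
import numpy as np

def averaged_sgd(grad, w0, eta=0.01, n_steps=1000):
    """Polyak–Ruppert averaging: plain SGD, but return the mean iterate."""
    w = np.array(w0, dtype=float)
    w_bar = np.zeros_like(w)
    for t in range(n_steps):
        w_bar += (w - w_bar) / (t + 1)   # running mean of w_0, ..., w_t
        w = w - eta * grad(w)            # ordinary SGD update
    return w_bar
```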
### AdaGrad

*AdaGrad* (for adaptive gradient algorithm) is a modified stochastic gradient descent algorithm with per-parameter learning rate, first published in 2011.[38] Informally, this increases the learning rate for sparser parameters[clarification needed] and decreases the learning rate for ones that are less sparse. This strategy often improves convergence performance over standard stochastic gradient descent in settings where data is sparse and sparse parameters are more informative. Examples of such applications include natural language processing and image recognition.[38]

It still has a base learning rate $\eta$, but this is multiplied with the elements of a vector $\{G_{j,j}\}$ which is the diagonal of the outer product matrix

$$G = \sum_{\tau=1}^{t} g_\tau g_\tau^{\mathsf{T}}$$

where $g_\tau = \nabla Q_i(w)$ is the gradient at iteration $\tau$.
The diagonal is given by

$$G_{j,j} = \sum_{\tau=1}^{t} g_{\tau,j}^{2}.$$

This vector essentially stores a historical sum of gradient squares by dimension and is updated after every iteration. The formula for an update is now[a]

$$w := w - \eta\, \mathrm{diag}(G)^{-\frac{1}{2}} \odot g$$

or, written as per-parameter updates,

$$w_j := w_j - \frac{\eta}{\sqrt{G_{j,j}}} g_j.$$

Each $\{G_{(i,i)}\}$ gives rise to a scaling factor for the learning rate that applies to a single parameter $w_i$. Since the denominator in this factor, $\sqrt{G_i} = \sqrt{\sum_{\tau=1}^{t} g_\tau^2}$, is the $\ell_2$ norm of previous derivatives, extreme parameter updates get dampened, while parameters that get few or small updates receive higher learning rates.[34]

While designed for convex problems, AdaGrad has been successfully applied to non-convex optimization.[39]
RMSProp"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p><i>RMSProp</i> (for Root Mean Square Propagation) is a method invented in 2012 by James Martens and <a href="/wiki/Ilya_Sutskever" title="Ilya Sutskever">Ilya Sutskever</a>, at the time both PhD students in Geoffrey Hinton's group, in which the <a href="/wiki/Learning_rate" title="Learning rate">learning rate</a> is, like in Adagrad, adapted for each of the parameters. The idea is to divide the learning rate for a weight by a running average of the magnitudes of recent gradients for that weight.<sup id="cite_ref-rmsprop_41-0" class="reference"><a href="#cite_note-rmsprop-41"><span class="cite-bracket">&#91;</span>40<span class="cite-bracket">&#93;</span></a></sup> Unusually, it was not published in an article but merely described in a <a href="/wiki/Coursera" title="Coursera">Coursera</a> lecture.<sup class="noprint Inline-Template Template-Fact" style="white-space:nowrap;">&#91;<i><a href="/wiki/Wikipedia:Citation_needed" title="Wikipedia:Citation needed"><span title="This claim needs references to reliable sources. (June 2023)">citation needed</span></a></i>&#93;</sup> Citation 1: <a rel="nofollow" class="external free" href="https://deepai.org/machine-learning-glossary-and-terms/rmsprop#:~:text=The%20RMSProp%20algorithm%20was%20introduced,its%20effectiveness%20in%20various%20applications">https://deepai.org/machine-learning-glossary-and-terms/rmsprop#:~:text=The%20RMSProp%20algorithm%20was%20introduced,its%20effectiveness%20in%20various%20applications</a>. Citation 2: this video at 36:37 <a rel="nofollow" class="external free" href="https://www.youtube.com/watch?v=-eyhCTvrEtE&amp;t=36m37s">https://www.youtube.com/watch?v=-eyhCTvrEtE&amp;t=36m37s</a> </p><p>So, first the running average is calculated in terms of means square, </p><p><span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle v(w,t):=\gamma v(w,t-1)+\left(1-\gamma \right)\left(\nabla Q_{i}(w)\right)^{2}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>v</mi> <mo stretchy="false">(</mo> <mi>w</mi> <mo>,</mo> <mi>t</mi> <mo stretchy="false">)</mo> <mo>:=</mo> <mi>&#x03B3;<!-- γ --></mi> <mi>v</mi> <mo stretchy="false">(</mo> <mi>w</mi> <mo>,</mo> <mi>t</mi> <mo>&#x2212;<!-- − --></mo> <mn>1</mn> <mo stretchy="false">)</mo> <mo>+</mo> <mrow> <mo>(</mo> <mrow> <mn>1</mn> <mo>&#x2212;<!-- − --></mo> <mi>&#x03B3;<!-- γ --></mi> </mrow> <mo>)</mo> </mrow> <msup> <mrow> <mo>(</mo> <mrow> <mi mathvariant="normal">&#x2207;<!-- ∇ --></mi> <msub> <mi>Q</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </msub> <mo stretchy="false">(</mo> <mi>w</mi> <mo stretchy="false">)</mo> </mrow> <mo>)</mo> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle v(w,t):=\gamma v(w,t-1)+\left(1-\gamma \right)\left(\nabla Q_{i}(w)\right)^{2}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/86cb444a65e0eb7422feeb14dd95a0ed3fe675b5" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:43.173ex; height:3.343ex;" alt="{\displaystyle v(w,t):=\gamma v(w,t-1)+\left(1-\gamma \right)\left(\nabla Q_{i}(w)\right)^{2}}"></span> </p><p>where, <span 
class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \gamma }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>&#x03B3;<!-- γ --></mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \gamma }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/a223c880b0ce3da8f64ee33c4f0010beee400b1a" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:1.262ex; height:2.176ex;" alt="{\displaystyle \gamma }"></span> is the forgetting factor. The concept of storing the historical gradient as sum of squares is borrowed from Adagrad, but "forgetting" is introduced to solve Adagrad's diminishing learning rates in non-convex problems by gradually decreasing the influence of old data.<sup class="noprint Inline-Template Template-Fact" style="white-space:nowrap;">&#91;<i><a href="/wiki/Wikipedia:Citation_needed" title="Wikipedia:Citation needed"><span title="This claim needs references to reliable sources. (June 2024)">citation needed</span></a></i>&#93;</sup> </p><p>And the parameters are updated as, </p><p><span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle w:=w-{\frac {\eta }{\sqrt {v(w,t)}}}\nabla Q_{i}(w)}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>w</mi> <mo>:=</mo> <mi>w</mi> <mo>&#x2212;<!-- − --></mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mi>&#x03B7;<!-- η --></mi> <msqrt> <mi>v</mi> <mo stretchy="false">(</mo> <mi>w</mi> <mo>,</mo> <mi>t</mi> <mo stretchy="false">)</mo> </msqrt> </mfrac> </mrow> <mi mathvariant="normal">&#x2207;<!-- ∇ --></mi> <msub> <mi>Q</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </msub> <mo stretchy="false">(</mo> <mi>w</mi> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle w:=w-{\frac {\eta }{\sqrt {v(w,t)}}}\nabla Q_{i}(w)}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/fc46ae8619e71130c6c8212eec31560cb4891c0a" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -3.171ex; width:27.596ex; height:6.176ex;" alt="{\displaystyle w:=w-{\frac {\eta }{\sqrt {v(w,t)}}}\nabla Q_{i}(w)}"></span> </p><p>RMSProp has shown good adaptation of learning rate in different applications. 
RMSProp has shown good adaptation of learning rate in different applications. It can be seen as a generalization of Rprop and is capable of working with mini-batches, as opposed to only full batches.[40]

### Adam

*Adam*[41] (short for Adaptive Moment Estimation) is a 2014 update to the *RMSProp* optimizer combining it with the main feature of the *momentum method*.[42] In this optimization algorithm, running averages with exponential forgetting of both the gradients and the second moments of the gradients are used. Given parameters $w^{(t)}$ and a loss function $L^{(t)}$, where $t$ indexes the current training iteration (indexed at $0$), Adam's parameter update is given by:

$$m_w^{(t+1)} := \beta_1 m_w^{(t)} + \left(1-\beta_1\right) \nabla_w L^{(t)}$$

$$v_w^{(t+1)} := \beta_2 v_w^{(t)} + \left(1-\beta_2\right) \left(\nabla_w L^{(t)}\right)^{2}$$

$$\hat{m}_w = \frac{m_w^{(t+1)}}{1 - \beta_1^{t}}$$

$$\hat{v}_w = \frac{v_w^{(t+1)}}{1 - \beta_2^{t}}$$

$$w^{(t+1)} := w^{(t)} - \eta \frac{\hat{m}_w}{\sqrt{\hat{v}_w} + \varepsilon}$$

where $\varepsilon$ is a small scalar (e.g. $10^{-8}$) used to prevent division by 0, and $\beta_1$ (e.g. 0.9) and $\beta_2$ (e.g. 0.999) are the forgetting factors for gradients and second moments of gradients, respectively. Squaring and square-rooting are done element-wise.
The initial proof establishing the convergence of Adam was incomplete, and subsequent analysis has revealed that Adam does not converge for all convex objectives.[43][44] Despite this, *Adam* continues to be used due to its strong performance in practice.[45]

#### Variants

The popularity of *Adam* inspired many variants and enhancements. Some examples include:

- Nesterov-enhanced gradients: *NAdam*,[46] *FASFA*[47]
- varying interpretations of second-order information: *Powerpropagation*[48] and *AdaSqrt*[49]
- using the infinity norm: *AdaMax*[41]
- *AMSGrad*,[50] which improves convergence over *Adam* by using the maximum of past squared gradients instead of the exponential average.[51] *AdamX*[52] further improves convergence over *AMSGrad*.
- *AdamW*,[53] which improves the weight decay.
descent"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Even though sign-based optimization goes back to the aforementioned <i>Rprop</i>, in 2018 researchers tried to simplify Adam by removing the magnitude of the stochastic gradient from being taken into account and only considering its sign.<sup id="cite_ref-55" class="reference"><a href="#cite_note-55"><span class="cite-bracket">&#91;</span>54<span class="cite-bracket">&#93;</span></a></sup><sup id="cite_ref-56" class="reference"><a href="#cite_note-56"><span class="cite-bracket">&#91;</span>55<span class="cite-bracket">&#93;</span></a></sup> </p> <style data-mw-deduplicate="TemplateStyles:r1251242444">.mw-parser-output .ambox{border:1px solid #a2a9b1;border-left:10px solid #36c;background-color:#fbfbfb;box-sizing:border-box}.mw-parser-output .ambox+link+.ambox,.mw-parser-output .ambox+link+style+.ambox,.mw-parser-output .ambox+link+link+.ambox,.mw-parser-output .ambox+.mw-empty-elt+link+.ambox,.mw-parser-output .ambox+.mw-empty-elt+link+style+.ambox,.mw-parser-output .ambox+.mw-empty-elt+link+link+.ambox{margin-top:-1px}html body.mediawiki .mw-parser-output .ambox.mbox-small-left{margin:4px 1em 4px 0;overflow:hidden;width:238px;border-collapse:collapse;font-size:88%;line-height:1.25em}.mw-parser-output .ambox-speedy{border-left:10px solid #b32424;background-color:#fee7e6}.mw-parser-output .ambox-delete{border-left:10px solid #b32424}.mw-parser-output .ambox-content{border-left:10px solid #f28500}.mw-parser-output .ambox-style{border-left:10px solid #fc3}.mw-parser-output .ambox-move{border-left:10px solid #9932cc}.mw-parser-output .ambox-protection{border-left:10px solid #a2a9b1}.mw-parser-output .ambox .mbox-text{border:none;padding:0.25em 0.5em;width:100%}.mw-parser-output .ambox .mbox-image{border:none;padding:2px 0 2px 0.5em;text-align:center}.mw-parser-output .ambox .mbox-imageright{border:none;padding:2px 0.5em 2px 0;text-align:center}.mw-parser-output .ambox .mbox-empty-cell{border:none;padding:0;width:1px}.mw-parser-output .ambox .mbox-image-div{width:52px}@media(min-width:720px){.mw-parser-output .ambox{margin:0 10%}}@media print{body.ns-0 .mw-parser-output .ambox{display:none!important}}</style><table class="box-Expand_section plainlinks metadata ambox mbox-small-left ambox-content" role="presentation"><tbody><tr><td class="mbox-image"><span typeof="mw:File"><a href="/wiki/File:Wiki_letter_w_cropped.svg" class="mw-file-description"><img alt="[icon]" src="//upload.wikimedia.org/wikipedia/commons/thumb/1/1c/Wiki_letter_w_cropped.svg/20px-Wiki_letter_w_cropped.svg.png" decoding="async" width="20" height="14" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/1/1c/Wiki_letter_w_cropped.svg/30px-Wiki_letter_w_cropped.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/1/1c/Wiki_letter_w_cropped.svg/40px-Wiki_letter_w_cropped.svg.png 2x" data-file-width="44" data-file-height="31" /></a></span></td><td class="mbox-text"><div class="mbox-text-span">This section <b>needs expansion</b>. You can help by <a class="external text" href="https://en.wikipedia.org/w/index.php?title=Stochastic_gradient_descent&amp;action=edit&amp;section=">adding to it</a>. 
<span class="date-container"><i>(<span class="date">June 2023</span>)</i></span></div></td></tr></tbody></table> <div class="mw-heading mw-heading3"><h3 id="Backtracking_line_search">Backtracking line search</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Stochastic_gradient_descent&amp;action=edit&amp;section=15" title="Edit section: Backtracking line search"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p><a href="/wiki/Backtracking_line_search" title="Backtracking line search">Backtracking line search</a> is another variant of gradient descent. All of the below are sourced from the mentioned link. It is based on a condition known as the Armijo–Goldstein condition. Both methods allow learning rates to change at each iteration; however, the manner of the change is different. Backtracking line search uses function evaluations to check Armijo's condition, and in principle the loop in the algorithm for determining the learning rates can be long and unknown in advance. Adaptive SGD does not need a loop in determining learning rates. On the other hand, adaptive SGD does not guarantee the "descent property" – which Backtracking line search enjoys – which is that <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle f(x_{n+1})\leq f(x_{n})}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>f</mi> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>n</mi> <mo>+</mo> <mn>1</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo>&#x2264;<!-- ≤ --></mo> <mi>f</mi> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>n</mi> </mrow> </msub> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle f(x_{n+1})\leq f(x_{n})}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/e4b45082cf76fd9282918d1d72baf9d5ff4e0fd5" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:16.471ex; height:2.843ex;" alt="{\displaystyle f(x_{n+1})\leq f(x_{n})}"></span> for all n. If the gradient of the cost function is globally Lipschitz continuous, with Lipschitz constant L, and learning rate is chosen of the order 1/L, then the standard version of SGD is a special case of backtracking line search. </p> <div class="mw-heading mw-heading3"><h3 id="Second-order_methods">Second-order methods</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Stochastic_gradient_descent&amp;action=edit&amp;section=16" title="Edit section: Second-order methods"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>A stochastic analogue of the standard (deterministic) <a href="/wiki/Newton%27s_method_in_optimization" title="Newton&#39;s method in optimization">Newton–Raphson algorithm</a> (a "second-order" method) provides an asymptotically optimal or near-optimal form of iterative optimization in the setting of stochastic approximation<sup class="noprint Inline-Template Template-Fact" style="white-space:nowrap;">&#91;<i><a href="/wiki/Wikipedia:Citation_needed" title="Wikipedia:Citation needed"><span title="This claim needs references to reliable sources. 
(April 2020)">citation needed</span></a></i>&#93;</sup>. A method that uses direct measurements of the <a href="/wiki/Hessian_matrix" title="Hessian matrix">Hessian matrices</a> of the summands in the empirical risk function was developed by Byrd, Hansen, Nocedal, and Singer.<sup id="cite_ref-57" class="reference"><a href="#cite_note-57"><span class="cite-bracket">&#91;</span>56<span class="cite-bracket">&#93;</span></a></sup> However, directly determining the required Hessian matrices for optimization may not be possible in practice. Practical and theoretically sound methods for second-order versions of SGD that do not require direct Hessian information are given by Spall and others.<sup id="cite_ref-58" class="reference"><a href="#cite_note-58"><span class="cite-bracket">&#91;</span>57<span class="cite-bracket">&#93;</span></a></sup><sup id="cite_ref-59" class="reference"><a href="#cite_note-59"><span class="cite-bracket">&#91;</span>58<span class="cite-bracket">&#93;</span></a></sup><sup id="cite_ref-60" class="reference"><a href="#cite_note-60"><span class="cite-bracket">&#91;</span>59<span class="cite-bracket">&#93;</span></a></sup> (A less efficient method based on finite differences, instead of simultaneous perturbations, is given by Ruppert.<sup id="cite_ref-61" class="reference"><a href="#cite_note-61"><span class="cite-bracket">&#91;</span>60<span class="cite-bracket">&#93;</span></a></sup>) Another approach to the approximation Hessian matrix is replacing it with the Fisher information matrix, which transforms usual gradient to natural.<sup id="cite_ref-62" class="reference"><a href="#cite_note-62"><span class="cite-bracket">&#91;</span>61<span class="cite-bracket">&#93;</span></a></sup> These methods not requiring direct Hessian information are based on either values of the summands in the above empirical risk function or values of the gradients of the summands (i.e., the SGD inputs). In particular, second-order optimality is asymptotically achievable without direct calculation of the Hessian matrices of the summands in the empirical risk function. 
When the objective is a nonlinear least-squares loss

\[ Q(w) = \frac{1}{n}\sum_{i=1}^{n} Q_i(w) = \frac{1}{n}\sum_{i=1}^{n} \left( m(w; x_i) - y_i \right)^2, \]

where m(w; x_i) is the predictive model (e.g., a deep neural network), the objective's structure can be exploited to estimate second-order information using gradients only. The resulting methods are simple and often effective.[62]
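One classical way to exploit that structure is the Gauss–Newton approximation, shown here as an illustration (the cited work develops its own estimator): the Hessian of the loss is approximated by

\[ \nabla^2 Q(w) \approx \frac{2}{n} \sum_{i=1}^{n} \nabla m(w; x_i)\, \nabla m(w; x_i)^{T}, \]

where the residual term (m(w; x_i) − y_i) ∇²m(w; x_i) is dropped, so only first derivatives of the model are required.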
Approximations in continuous time

For a small learning rate η, stochastic gradient descent (w_n)_{n∈ℕ₀} can be viewed as a discretization of the gradient flow ODE

\[ \frac{d}{dt} W_t = -\nabla Q(W_t) \]

subject to additional stochastic noise.
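Concretely (a standard observation, stated here for orientation): one SGD step is a forward-Euler discretization of this flow with step size η in which the full gradient is replaced by the sampled one,

\[ w_{n+1} = w_n - \eta\, \nabla Q_{i_n}(w_n) \quad\text{versus}\quad W_{(n+1)\eta} \approx W_{n\eta} - \eta\, \nabla Q(W_{n\eta}), \]

so the deviation between the two trajectories stems from the Euler discretization error together with the sampling noise ∇Q_{i_n} − ∇Q.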
This approximation is valid only on a finite time horizon, in the following sense: assume that all the coefficients Q_i are sufficiently smooth, and let T > 0 and g : ℝ^d → ℝ be a sufficiently smooth test function.
Then, there exists a constant C > 0 such that for all η > 0

\[ \max_{k=0,\dots,\lfloor T/\eta \rfloor} \left| \mathbb{E}[g(w_k)] - g(W_{k\eta}) \right| \le C\eta, \]
class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\textstyle \mathbb {E} }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="false" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="double-struck">E</mi> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\textstyle \mathbb {E} }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/e6c1347563cdaf813b6d4f37eb1ee7488645a667" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.55ex; height:2.176ex;" alt="{\textstyle \mathbb {E} }"></span> denotes taking the expectation with respect to the random choice of indices in the stochastic gradient descent scheme. </p><p>Since this approximation does not capture the random fluctuations around the mean behavior of stochastic gradient descent solutions to <a href="/wiki/Stochastic_differential_equations" class="mw-redirect" title="Stochastic differential equations">stochastic differential equations</a> (SDEs) have been proposed as limiting objects.<sup id="cite_ref-64" class="reference"><a href="#cite_note-64"><span class="cite-bracket">&#91;</span>63<span class="cite-bracket">&#93;</span></a></sup> More precisely, the solution to the SDE </p><p><span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle dW_{t}=-\nabla \left(Q(W_{t})+{\tfrac {1}{4}}\eta |\nabla Q(W_{t})|^{2}\right)dt+{\sqrt {\eta }}\Sigma (W_{t})^{1/2}dB_{t},}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>d</mi> <msub> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>=</mo> <mo>&#x2212;<!-- − --></mo> <mi mathvariant="normal">&#x2207;<!-- ∇ --></mi> <mrow> <mo>(</mo> <mrow> <mi>Q</mi> <mo stretchy="false">(</mo> <msub> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo stretchy="false">)</mo> <mo>+</mo> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="false" scriptlevel="0"> <mfrac> <mn>1</mn> <mn>4</mn> </mfrac> </mstyle> </mrow> <mi>&#x03B7;<!-- η --></mi> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mi mathvariant="normal">&#x2207;<!-- ∇ --></mi> <mi>Q</mi> <mo stretchy="false">(</mo> <msub> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo stretchy="false">)</mo> <msup> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> </mrow> <mo>)</mo> </mrow> <mi>d</mi> <mi>t</mi> <mo>+</mo> <mrow class="MJX-TeXAtom-ORD"> <msqrt> <mi>&#x03B7;<!-- η --></mi> </msqrt> </mrow> <mi mathvariant="normal">&#x03A3;<!-- Σ --></mi> <mo stretchy="false">(</mo> <msub> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <msup> <mo stretchy="false">)</mo> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> <mrow class="MJX-TeXAtom-ORD"> <mo>/</mo> </mrow> <mn>2</mn> </mrow> </msup> <mi>d</mi> <msub> <mi>B</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>,</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle dW_{t}=-\nabla \left(Q(W_{t})+{\tfrac {1}{4}}\eta |\nabla Q(W_{t})|^{2}\right)dt+{\sqrt {\eta }}\Sigma (W_{t})^{1/2}dB_{t},}</annotation> </semantics> </math></span><img 
src="https://wikimedia.org/api/rest_v1/media/math/render/svg/c45808a4b6582dbf5cab794d403c42b933b36e71" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.838ex; width:59.573ex; height:4.843ex;" alt="{\displaystyle dW_{t}=-\nabla \left(Q(W_{t})+{\tfrac {1}{4}}\eta |\nabla Q(W_{t})|^{2}\right)dt+{\sqrt {\eta }}\Sigma (W_{t})^{1/2}dB_{t},}"></span> </p><p>for <span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \Sigma (w)={\frac {1}{n^{2}}}\left(\sum _{i=1}^{n}Q_{i}(w)-Q(w)\right)\left(\sum _{i=1}^{n}Q_{i}(w)-Q(w)\right)^{T}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi mathvariant="normal">&#x03A3;<!-- Σ --></mi> <mo stretchy="false">(</mo> <mi>w</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mn>1</mn> <msup> <mi>n</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> </mfrac> </mrow> <mrow> <mo>(</mo> <mrow> <munderover> <mo>&#x2211;<!-- ∑ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> <mo>=</mo> <mn>1</mn> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>n</mi> </mrow> </munderover> <msub> <mi>Q</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </msub> <mo stretchy="false">(</mo> <mi>w</mi> <mo stretchy="false">)</mo> <mo>&#x2212;<!-- − --></mo> <mi>Q</mi> <mo stretchy="false">(</mo> <mi>w</mi> <mo stretchy="false">)</mo> </mrow> <mo>)</mo> </mrow> <msup> <mrow> <mo>(</mo> <mrow> <munderover> <mo>&#x2211;<!-- ∑ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> <mo>=</mo> <mn>1</mn> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>n</mi> </mrow> </munderover> <msub> <mi>Q</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </msub> <mo stretchy="false">(</mo> <mi>w</mi> <mo stretchy="false">)</mo> <mo>&#x2212;<!-- − --></mo> <mi>Q</mi> <mo stretchy="false">(</mo> <mi>w</mi> <mo stretchy="false">)</mo> </mrow> <mo>)</mo> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>T</mi> </mrow> </msup> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \Sigma (w)={\frac {1}{n^{2}}}\left(\sum _{i=1}^{n}Q_{i}(w)-Q(w)\right)\left(\sum _{i=1}^{n}Q_{i}(w)-Q(w)\right)^{T}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/4f9ff30f20c3ca9e176bfecf25c4d22437425978" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -3.171ex; width:57.072ex; height:8.009ex;" alt="{\displaystyle \Sigma (w)={\frac {1}{n^{2}}}\left(\sum _{i=1}^{n}Q_{i}(w)-Q(w)\right)\left(\sum _{i=1}^{n}Q_{i}(w)-Q(w)\right)^{T}}"></span> where <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\textstyle dB_{t}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="false" scriptlevel="0"> <mi>d</mi> <msub> <mi>B</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\textstyle dB_{t}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/2d1fb80dda72890576186bea3ccf64e2f23cb488" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:3.806ex; height:2.509ex;" alt="{\textstyle dB_{t}}"></span> denotes the <a 
href="/wiki/Ito_integral" class="mw-redirect" title="Ito integral">Ito-integral</a> with respect to a <a href="/wiki/Brownian_motion" title="Brownian motion">Brownian motion</a> is a more precise approximation in the sense that there exists a constant <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\textstyle C&gt;0}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="false" scriptlevel="0"> <mi>C</mi> <mo>&gt;</mo> <mn>0</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\textstyle C&gt;0}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/69e1fe97062f3ebc7ce3c06da5e208b726d4a91f" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:6.027ex; height:2.176ex;" alt="{\textstyle C&gt;0}"></span> such that </p><p><span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \max _{k=0,\dots ,\lfloor T/\eta \rfloor }\left|\mathbb {E} [g(w_{k})]-\mathbb {E} [g(W_{k\eta })]\right|\leq C\eta ^{2}.}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <munder> <mo movablelimits="true" form="prefix">max</mo> <mrow class="MJX-TeXAtom-ORD"> <mi>k</mi> <mo>=</mo> <mn>0</mn> <mo>,</mo> <mo>&#x2026;<!-- … --></mo> <mo>,</mo> <mo fence="false" stretchy="false">&#x230A;<!-- ⌊ --></mo> <mi>T</mi> <mrow class="MJX-TeXAtom-ORD"> <mo>/</mo> </mrow> <mi>&#x03B7;<!-- η --></mi> <mo fence="false" stretchy="false">&#x230B;<!-- ⌋ --></mo> </mrow> </munder> <mrow> <mo>|</mo> <mrow> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="double-struck">E</mi> </mrow> <mo stretchy="false">[</mo> <mi>g</mi> <mo stretchy="false">(</mo> <msub> <mi>w</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>k</mi> </mrow> </msub> <mo stretchy="false">)</mo> <mo stretchy="false">]</mo> <mo>&#x2212;<!-- − --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="double-struck">E</mi> </mrow> <mo stretchy="false">[</mo> <mi>g</mi> <mo stretchy="false">(</mo> <msub> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>k</mi> <mi>&#x03B7;<!-- η --></mi> </mrow> </msub> <mo stretchy="false">)</mo> <mo stretchy="false">]</mo> </mrow> <mo>|</mo> </mrow> <mo>&#x2264;<!-- ≤ --></mo> <mi>C</mi> <msup> <mi>&#x03B7;<!-- η --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> <mo>.</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \max _{k=0,\dots ,\lfloor T/\eta \rfloor }\left|\mathbb {E} [g(w_{k})]-\mathbb {E} [g(W_{k\eta })]\right|\leq C\eta ^{2}.}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/5f61fe4ce716a0fdca021434da3aa17b91b1bd04" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.505ex; width:40.725ex; height:4.843ex;" alt="{\displaystyle \max _{k=0,\dots ,\lfloor T/\eta \rfloor }\left|\mathbb {E} [g(w_{k})]-\mathbb {E} [g(W_{k\eta })]\right|\leq C\eta ^{2}.}"></span> </p><p>However this SDE only approximates the one-point motion of stochastic gradient descent. 
For an approximation of the stochastic flow one has to consider SDEs with infinite-dimensional noise.[64]

See also

- Backtracking line search
- Broken Neural Scaling Law
- Coordinate descent – changes one coordinate at a time, rather than one example
- Linear classifier
- Online machine learning
- Stochastic hill climbing
- Stochastic variance reduction

Notes

a. ⊙ denotes the element-wise (Hadamard) product.
class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.505ex; width:1.808ex; height:2.176ex;" alt="{\displaystyle \odot }"></span> denotes the <a href="/wiki/Hadamard_product_(matrices)" title="Hadamard product (matrices)">element-wise product</a>.</span> </li> </ol></div></div> <div class="mw-heading mw-heading2"><h2 id="References"><span lang="ru" dir="ltr">References</span></h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Stochastic_gradient_descent&amp;action=edit&amp;section=20" title="Edit section: References"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1239543626"><div class="reflist reflist-columns references-column-width" style="column-width: 30em;"> <ol class="references"> <li id="cite_note-1"><span class="mw-cite-backlink"><b><a href="#cite_ref-1">^</a></b></span> <span class="reference-text"><style data-mw-deduplicate="TemplateStyles:r1238218222">.mw-parser-output cite.citation{font-style:inherit;word-wrap:break-word}.mw-parser-output .citation q{quotes:"\"""\"""'""'"}.mw-parser-output .citation:target{background-color:rgba(0,127,255,0.133)}.mw-parser-output .id-lock-free.id-lock-free a{background:url("//upload.wikimedia.org/wikipedia/commons/6/65/Lock-green.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-limited.id-lock-limited a,.mw-parser-output .id-lock-registration.id-lock-registration a{background:url("//upload.wikimedia.org/wikipedia/commons/d/d6/Lock-gray-alt-2.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-subscription.id-lock-subscription a{background:url("//upload.wikimedia.org/wikipedia/commons/a/aa/Lock-red-alt-2.svg")right 0.1em center/9px no-repeat}.mw-parser-output .cs1-ws-icon a{background:url("//upload.wikimedia.org/wikipedia/commons/4/4c/Wikisource-logo.svg")right 0.1em center/12px no-repeat}body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-free a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-limited a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-registration a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-subscription a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .cs1-ws-icon a{background-size:contain;padding:0 1em 0 0}.mw-parser-output .cs1-code{color:inherit;background:inherit;border:none;padding:inherit}.mw-parser-output .cs1-hidden-error{display:none;color:var(--color-error,#d33)}.mw-parser-output .cs1-visible-error{color:var(--color-error,#d33)}.mw-parser-output .cs1-maint{display:none;color:#085;margin-left:0.3em}.mw-parser-output .cs1-kern-left{padding-left:0.2em}.mw-parser-output .cs1-kern-right{padding-right:0.2em}.mw-parser-output .citation .mw-selflink{font-weight:inherit}@media screen{.mw-parser-output .cs1-format{font-size:95%}html.skin-theme-clientpref-night .mw-parser-output .cs1-maint{color:#18911f}}@media screen and (prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .cs1-maint{color:#18911f}}</style><cite id="CITEREFBottouBousquet2012" class="citation book cs1"><a href="/wiki/L%C3%A9on_Bottou" title="Léon Bottou">Bottou, Léon</a>; Bousquet, Olivier (2012). <a rel="nofollow" class="external text" href="https://books.google.com/books?id=JPQx7s2L1A8C&amp;pg=PA351">"The Tradeoffs of Large Scale Learning"</a>. In Sra, Suvrit; Nowozin, Sebastian; Wright, Stephen J. 
(eds.). <i>Optimization for Machine Learning</i>. Cambridge: MIT Press. pp.&#160;<span class="nowrap">351–</span>368. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a>&#160;<a href="/wiki/Special:BookSources/978-0-262-01646-9" title="Special:BookSources/978-0-262-01646-9"><bdi>978-0-262-01646-9</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=bookitem&amp;rft.atitle=The+Tradeoffs+of+Large+Scale+Learning&amp;rft.btitle=Optimization+for+Machine+Learning&amp;rft.place=Cambridge&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E351-%3C%2Fspan%3E368&amp;rft.pub=MIT+Press&amp;rft.date=2012&amp;rft.isbn=978-0-262-01646-9&amp;rft.aulast=Bottou&amp;rft.aufirst=L%C3%A9on&amp;rft.au=Bousquet%2C+Olivier&amp;rft_id=https%3A%2F%2Fbooks.google.com%2Fbooks%3Fid%3DJPQx7s2L1A8C%26pg%3DPA351&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-Bottou_1998-2"><span class="mw-cite-backlink">^ <a href="#cite_ref-Bottou_1998_2-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Bottou_1998_2-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBottou1998" class="citation book cs1"><a href="/wiki/L%C3%A9on_Bottou" title="Léon Bottou">Bottou, Léon</a> (1998). "Online Algorithms and Stochastic Approximations". <span class="id-lock-registration" title="Free registration required"><a rel="nofollow" class="external text" href="https://archive.org/details/onlinelearningin0000unse"><i>Online Learning and Neural Networks</i></a></span>. Cambridge University Press. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a>&#160;<a href="/wiki/Special:BookSources/978-0-521-65263-6" title="Special:BookSources/978-0-521-65263-6"><bdi>978-0-521-65263-6</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=bookitem&amp;rft.atitle=Online+Algorithms+and+Stochastic+Approximations&amp;rft.btitle=Online+Learning+and+Neural+Networks&amp;rft.pub=Cambridge+University+Press&amp;rft.date=1998&amp;rft.isbn=978-0-521-65263-6&amp;rft.aulast=Bottou&amp;rft.aufirst=L%C3%A9on&amp;rft_id=https%3A%2F%2Farchive.org%2Fdetails%2Fonlinelearningin0000unse&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-3"><span class="mw-cite-backlink"><b><a href="#cite_ref-3">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFFerguson1982" class="citation journal cs1"><a href="/wiki/Thomas_S._Ferguson" title="Thomas S. Ferguson">Ferguson, Thomas S.</a> (1982). "An inconsistent maximum likelihood estimate". <i>Journal of the American Statistical Association</i>. <b>77</b> (380): <span class="nowrap">831–</span>834. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1080%2F01621459.1982.10477894">10.1080/01621459.1982.10477894</a>. 
<a href="/wiki/JSTOR_(identifier)" class="mw-redirect" title="JSTOR (identifier)">JSTOR</a>&#160;<a rel="nofollow" class="external text" href="https://www.jstor.org/stable/2287314">2287314</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Journal+of+the+American+Statistical+Association&amp;rft.atitle=An+inconsistent+maximum+likelihood+estimate&amp;rft.volume=77&amp;rft.issue=380&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E831-%3C%2Fspan%3E834&amp;rft.date=1982&amp;rft_id=info%3Adoi%2F10.1080%2F01621459.1982.10477894&amp;rft_id=https%3A%2F%2Fwww.jstor.org%2Fstable%2F2287314%23id-name%3DJSTOR&amp;rft.aulast=Ferguson&amp;rft.aufirst=Thomas+S.&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-4"><span class="mw-cite-backlink"><b><a href="#cite_ref-4">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBottouBousquet2008" class="citation conference cs1"><a href="/wiki/L%C3%A9on_Bottou" title="Léon Bottou">Bottou, Léon</a>; Bousquet, Olivier (2008). <a rel="nofollow" class="external text" href="http://leon.bottou.org/papers/bottou-bousquet-2008"><i>The Tradeoffs of Large Scale Learning</i></a>. <a href="/wiki/Advances_in_Neural_Information_Processing_Systems" class="mw-redirect" title="Advances in Neural Information Processing Systems">Advances in Neural Information Processing Systems</a>. Vol.&#160;20. pp.&#160;<span class="nowrap">161–</span>168.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=conference&amp;rft.btitle=The+Tradeoffs+of+Large+Scale+Learning&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E161-%3C%2Fspan%3E168&amp;rft.date=2008&amp;rft.aulast=Bottou&amp;rft.aufirst=L%C3%A9on&amp;rft.au=Bousquet%2C+Olivier&amp;rft_id=http%3A%2F%2Fleon.bottou.org%2Fpapers%2Fbottou-bousquet-2008&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-5"><span class="mw-cite-backlink"><b><a href="#cite_ref-5">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFMurphy2021" class="citation book cs1">Murphy, Kevin (2021). <a rel="nofollow" class="external text" href="https://probml.github.io/pml-book/book1.html"><i>Probabilistic Machine Learning: An Introduction</i></a>. MIT Press<span class="reference-accessdate">. Retrieved <span class="nowrap">10 April</span> 2021</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=book&amp;rft.btitle=Probabilistic+Machine+Learning%3A+An+Introduction&amp;rft.pub=MIT+Press&amp;rft.date=2021&amp;rft.aulast=Murphy&amp;rft.aufirst=Kevin&amp;rft_id=https%3A%2F%2Fprobml.github.io%2Fpml-book%2Fbook1.html&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-6"><span class="mw-cite-backlink"><b><a href="#cite_ref-6">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBilmesAsanovicChinDemmel1997" class="citation conference cs1">Bilmes, Jeff; <a href="/wiki/Krste_Asanovi%C4%87" title="Krste Asanović">Asanovic, Krste</a>; Chin, Chee-Whye; Demmel, James (April 1997). 
<a rel="nofollow" class="external text" href="https://ieeexplore.ieee.org/document/604861">"Using PHiPAC to speed error back-propagation learning"</a>. <i>1997 IEEE International Conference on Acoustics, Speech, and Signal Processing</i>. ICASSP. Munich, Germany: IEEE. pp.&#160;4153–4156 vol.5. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FICASSP.1997.604861">10.1109/ICASSP.1997.604861</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=conference&amp;rft.atitle=Using+PHiPAC+to+speed+error+back-propagation+learning&amp;rft.btitle=1997+IEEE+International+Conference+on+Acoustics%2C+Speech%2C+and+Signal+Processing&amp;rft.place=Munich%2C+Germany&amp;rft.pages=4153-4156+vol.5&amp;rft.pub=IEEE&amp;rft.date=1997-04&amp;rft_id=info%3Adoi%2F10.1109%2FICASSP.1997.604861&amp;rft.aulast=Bilmes&amp;rft.aufirst=Jeff&amp;rft.au=Asanovic%2C+Krste&amp;rft.au=Chin%2C+Chee-Whye&amp;rft.au=Demmel%2C+James&amp;rft_id=https%3A%2F%2Fieeexplore.ieee.org%2Fdocument%2F604861&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-7"><span class="mw-cite-backlink"><b><a href="#cite_ref-7">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKiwiel2001" class="citation journal cs1">Kiwiel, Krzysztof C. (2001). "Convergence and efficiency of subgradient methods for quasiconvex minimization". <i>Mathematical Programming, Series A</i>. <b>90</b> (1). Berlin, Heidelberg: Springer: <span class="nowrap">1–</span>25. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1007%2FPL00011414">10.1007/PL00011414</a>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a>&#160;<a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/0025-5610">0025-5610</a>. <a href="/wiki/MR_(identifier)" class="mw-redirect" title="MR (identifier)">MR</a>&#160;<a rel="nofollow" class="external text" href="https://mathscinet.ams.org/mathscinet-getitem?mr=1819784">1819784</a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:10043417">10043417</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Mathematical+Programming%2C+Series+A&amp;rft.atitle=Convergence+and+efficiency+of+subgradient+methods+for+quasiconvex+minimization&amp;rft.volume=90&amp;rft.issue=1&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E1-%3C%2Fspan%3E25&amp;rft.date=2001&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A10043417%23id-name%3DS2CID&amp;rft_id=https%3A%2F%2Fmathscinet.ams.org%2Fmathscinet-getitem%3Fmr%3D1819784%23id-name%3DMR&amp;rft.issn=0025-5610&amp;rft_id=info%3Adoi%2F10.1007%2FPL00011414&amp;rft.aulast=Kiwiel&amp;rft.aufirst=Krzysztof+C.&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-8"><span class="mw-cite-backlink"><b><a href="#cite_ref-8">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRobbinsSiegmund1971" class="citation book cs1"><a href="/wiki/Herbert_Robbins" title="Herbert Robbins">Robbins, Herbert</a>; <a href="/wiki/David_O._Siegmund" class="mw-redirect" title="David O. Siegmund">Siegmund, David O.</a> (1971). "A convergence theorem for non negative almost supermartingales and some applications". In Rustagi, Jagdish S. (ed.). <i>Optimizing Methods in Statistics</i>. Academic Press. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a>&#160;<a href="/wiki/Special:BookSources/0-12-604550-X" title="Special:BookSources/0-12-604550-X"><bdi>0-12-604550-X</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=bookitem&amp;rft.atitle=A+convergence+theorem+for+non+negative+almost+supermartingales+and+some+applications&amp;rft.btitle=Optimizing+Methods+in+Statistics&amp;rft.pub=Academic+Press&amp;rft.date=1971&amp;rft.isbn=0-12-604550-X&amp;rft.aulast=Robbins&amp;rft.aufirst=Herbert&amp;rft.au=Siegmund%2C+David+O.&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-9"><span class="mw-cite-backlink"><b><a href="#cite_ref-9">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBelkin2021" class="citation journal cs1">Belkin, Mikhail (May 2021). <a rel="nofollow" class="external text" href="https://www.cambridge.org/core/journals/acta-numerica/article/abs/fit-without-fear-remarkable-mathematical-phenomena-of-deep-learning-through-the-prism-of-interpolation/DBAC769EB7F4DBA5C4720932C2826014">"Fit without fear: remarkable mathematical phenomena of deep learning through the prism of interpolation"</a>. <i>Acta Numerica</i>. <b>30</b>: <span class="nowrap">203–</span>248. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2105.14368">2105.14368</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1017%2FS0962492921000039">10.1017/S0962492921000039</a>. 
<a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a>&#160;<a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/0962-4929">0962-4929</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Acta+Numerica&amp;rft.atitle=Fit+without+fear%3A+remarkable+mathematical+phenomena+of+deep+learning+through+the+prism+of+interpolation&amp;rft.volume=30&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E203-%3C%2Fspan%3E248&amp;rft.date=2021-05&amp;rft_id=info%3Aarxiv%2F2105.14368&amp;rft.issn=0962-4929&amp;rft_id=info%3Adoi%2F10.1017%2FS0962492921000039&amp;rft.aulast=Belkin&amp;rft.aufirst=Mikhail&amp;rft_id=https%3A%2F%2Fwww.cambridge.org%2Fcore%2Fjournals%2Facta-numerica%2Farticle%2Fabs%2Ffit-without-fear-remarkable-mathematical-phenomena-of-deep-learning-through-the-prism-of-interpolation%2FDBAC769EB7F4DBA5C4720932C2826014&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-rm-10"><span class="mw-cite-backlink"><b><a href="#cite_ref-rm_10-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRobbinsMonro1951" class="citation journal cs1"><a href="/wiki/Herbert_Robbins" title="Herbert Robbins">Robbins, H.</a>; Monro, S. (1951). <a rel="nofollow" class="external text" href="https://doi.org/10.1214%2Faoms%2F1177729586">"A Stochastic Approximation Method"</a>. <i>The Annals of Mathematical Statistics</i>. <b>22</b> (3): 400. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.1214%2Faoms%2F1177729586">10.1214/aoms/1177729586</a></span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=The+Annals+of+Mathematical+Statistics&amp;rft.atitle=A+Stochastic+Approximation+Method&amp;rft.volume=22&amp;rft.issue=3&amp;rft.pages=400&amp;rft.date=1951&amp;rft_id=info%3Adoi%2F10.1214%2Faoms%2F1177729586&amp;rft.aulast=Robbins&amp;rft.aufirst=H.&amp;rft.au=Monro%2C+S.&amp;rft_id=https%3A%2F%2Fdoi.org%2F10.1214%252Faoms%252F1177729586&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-11"><span class="mw-cite-backlink"><b><a href="#cite_ref-11">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKieferWolfowitz1952" class="citation journal cs1">Kiefer, J.; Wolfowitz, J. (1952). <a rel="nofollow" class="external text" href="https://doi.org/10.1214%2Faoms%2F1177729392">"Stochastic Estimation of the Maximum of a Regression Function"</a>. <i>The Annals of Mathematical Statistics</i>. <b>23</b> (3): <span class="nowrap">462–</span>466. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.1214%2Faoms%2F1177729392">10.1214/aoms/1177729392</a></span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=The+Annals+of+Mathematical+Statistics&amp;rft.atitle=Stochastic+Estimation+of+the+Maximum+of+a+Regression+Function&amp;rft.volume=23&amp;rft.issue=3&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E462-%3C%2Fspan%3E466&amp;rft.date=1952&amp;rft_id=info%3Adoi%2F10.1214%2Faoms%2F1177729392&amp;rft.aulast=Kiefer&amp;rft.aufirst=J.&amp;rft.au=Wolfowitz%2C+J.&amp;rft_id=https%3A%2F%2Fdoi.org%2F10.1214%252Faoms%252F1177729392&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-12"><span class="mw-cite-backlink"><b><a href="#cite_ref-12">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRosenblatt1958" class="citation journal cs1">Rosenblatt, F. (1958). "The perceptron: A probabilistic model for information storage and organization in the brain". <i>Psychological Review</i>. <b>65</b> (6): <span class="nowrap">386–</span>408. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1037%2Fh0042519">10.1037/h0042519</a>. <a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a>&#160;<a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/13602029">13602029</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:12781225">12781225</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Psychological+Review&amp;rft.atitle=The+perceptron%3A+A+probabilistic+model+for+information+storage+and+organization+in+the+brain.&amp;rft.volume=65&amp;rft.issue=6&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E386-%3C%2Fspan%3E408&amp;rft.date=1958&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A12781225%23id-name%3DS2CID&amp;rft_id=info%3Apmid%2F13602029&amp;rft_id=info%3Adoi%2F10.1037%2Fh0042519&amp;rft.aulast=Rosenblatt&amp;rft.aufirst=F.&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-13"><span class="mw-cite-backlink"><b><a href="#cite_ref-13">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBilmesAsanovicChinDemmel1997" class="citation conference cs1">Bilmes, Jeff; <a href="/wiki/Krste_Asanovi%C4%87" title="Krste Asanović">Asanovic, Krste</a>; Chin, Chee-Whye; Demmel, James (April 1997). <a rel="nofollow" class="external text" href="https://ieeexplore.ieee.org/document/604861">"Using PHiPAC to speed error back-propagation learning"</a>. <i>1997 IEEE International Conference on Acoustics, Speech, and Signal Processing</i>. ICASSP. Munich, Germany: IEEE. pp.&#160;4153–4156 vol.5. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FICASSP.1997.604861">10.1109/ICASSP.1997.604861</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=conference&amp;rft.atitle=Using+PHiPAC+to+speed+error+back-propagation+learning&amp;rft.btitle=1997+IEEE+International+Conference+on+Acoustics%2C+Speech%2C+and+Signal+Processing&amp;rft.place=Munich%2C+Germany&amp;rft.pages=4153-4156+vol.5&amp;rft.pub=IEEE&amp;rft.date=1997-04&amp;rft_id=info%3Adoi%2F10.1109%2FICASSP.1997.604861&amp;rft.aulast=Bilmes&amp;rft.aufirst=Jeff&amp;rft.au=Asanovic%2C+Krste&amp;rft.au=Chin%2C+Chee-Whye&amp;rft.au=Demmel%2C+James&amp;rft_id=https%3A%2F%2Fieeexplore.ieee.org%2Fdocument%2F604861&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-14"><span class="mw-cite-backlink"><b><a href="#cite_ref-14">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPengLiWang2020" class="citation journal cs1">Peng, Xinyu; Li, Li; Wang, Fei-Yue (2020). <a rel="nofollow" class="external text" href="https://ieeexplore.ieee.org/document/8945166">"Accelerating Minibatch Stochastic Gradient Descent Using Typicality Sampling"</a>. <i>IEEE Transactions on Neural Networks and Learning Systems</i>. <b>31</b> (11): <span class="nowrap">4649–</span>4659. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1903.04192">1903.04192</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FTNNLS.2019.2957003">10.1109/TNNLS.2019.2957003</a>. <a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a>&#160;<a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/31899442">31899442</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:73728964">73728964</a><span class="reference-accessdate">. 
Retrieved <span class="nowrap">2023-10-02</span></span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=IEEE+Transactions+on+Neural+Networks+and+Learning+Systems&amp;rft.atitle=Accelerating+Minibatch+Stochastic+Gradient+Descent+Using+Typicality+Sampling&amp;rft.volume=31&amp;rft.issue=11&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E4649-%3C%2Fspan%3E4659&amp;rft.date=2020&amp;rft_id=info%3Aarxiv%2F1903.04192&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A73728964%23id-name%3DS2CID&amp;rft_id=info%3Apmid%2F31899442&amp;rft_id=info%3Adoi%2F10.1109%2FTNNLS.2019.2957003&amp;rft.aulast=Peng&amp;rft.aufirst=Xinyu&amp;rft.au=Li%2C+Li&amp;rft.au=Wang%2C+Fei-Yue&amp;rft_id=https%3A%2F%2Fieeexplore.ieee.org%2Fdocument%2F8945166&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-15"><span class="mw-cite-backlink"><b><a href="#cite_ref-15">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRumelhartHintonWilliams1986" class="citation journal cs1">Rumelhart, David E.; Hinton, Geoffrey E.; Williams, Ronald J. (October 1986). <a rel="nofollow" class="external text" href="https://www.nature.com/articles/323533a0">"Learning representations by back-propagating errors"</a>. <i>Nature</i>. <b>323</b> (6088): <span class="nowrap">533–</span>536. <a href="/wiki/Bibcode_(identifier)" class="mw-redirect" title="Bibcode (identifier)">Bibcode</a>:<a rel="nofollow" class="external text" href="https://ui.adsabs.harvard.edu/abs/1986Natur.323..533R">1986Natur.323..533R</a>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1038%2F323533a0">10.1038/323533a0</a>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a>&#160;<a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/1476-4687">1476-4687</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:205001834">205001834</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Nature&amp;rft.atitle=Learning+representations+by+back-propagating+errors&amp;rft.volume=323&amp;rft.issue=6088&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E533-%3C%2Fspan%3E536&amp;rft.date=1986-10&amp;rft_id=info%3Adoi%2F10.1038%2F323533a0&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A205001834%23id-name%3DS2CID&amp;rft.issn=1476-4687&amp;rft_id=info%3Abibcode%2F1986Natur.323..533R&amp;rft.aulast=Rumelhart&amp;rft.aufirst=David+E.&amp;rft.au=Hinton%2C+Geoffrey+E.&amp;rft.au=Williams%2C+Ronald+J.&amp;rft_id=https%3A%2F%2Fwww.nature.com%2Farticles%2F323533a0&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-duchi2-16"><span class="mw-cite-backlink"><b><a href="#cite_ref-duchi2_16-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDuchiHazanSinger2011" class="citation journal cs1">Duchi, John; Hazan, Elad; Singer, Yoram (2011). 
<a rel="nofollow" class="external text" href="http://jmlr.org/papers/volume12/duchi11a/duchi11a.pdf">"Adaptive subgradient methods for online learning and stochastic optimization"</a> <span class="cs1-format">(PDF)</span>. <i><a href="/wiki/Journal_of_Machine_Learning_Research" title="Journal of Machine Learning Research">JMLR</a></i>. <b>12</b>: <span class="nowrap">2121–</span>2159.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=JMLR&amp;rft.atitle=Adaptive+subgradient+methods+for+online+learning+and+stochastic+optimization&amp;rft.volume=12&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E2121-%3C%2Fspan%3E2159&amp;rft.date=2011&amp;rft.aulast=Duchi&amp;rft.aufirst=John&amp;rft.au=Hazan%2C+Elad&amp;rft.au=Singer%2C+Yoram&amp;rft_id=http%3A%2F%2Fjmlr.org%2Fpapers%2Fvolume12%2Fduchi11a%2Fduchi11a.pdf&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-rmsprop2-17"><span class="mw-cite-backlink"><b><a href="#cite_ref-rmsprop2_17-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFHinton" class="citation web cs1"><a href="/wiki/Geoffrey_Hinton" title="Geoffrey Hinton">Hinton, Geoffrey</a>. <a rel="nofollow" class="external text" href="http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf">"Lecture 6e rmsprop: Divide the gradient by a running average of its recent magnitude"</a> <span class="cs1-format">(PDF)</span>. p.&#160;26<span class="reference-accessdate">. Retrieved <span class="nowrap">19 March</span> 2020</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=Lecture+6e+rmsprop%3A+Divide+the+gradient+by+a+running+average+of+its+recent+magnitude&amp;rft.pages=26&amp;rft.aulast=Hinton&amp;rft.aufirst=Geoffrey&amp;rft_id=http%3A%2F%2Fwww.cs.toronto.edu%2F~tijmen%2Fcsc321%2Fslides%2Flecture_slides_lec6.pdf&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-Adam20142-18"><span class="mw-cite-backlink"><b><a href="#cite_ref-Adam20142_18-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKingmaBa2014" class="citation arxiv cs1">Kingma, Diederik; Ba, Jimmy (2014). "Adam: A Method for Stochastic Optimization". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1412.6980">1412.6980</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=preprint&amp;rft.jtitle=arXiv&amp;rft.atitle=Adam%3A+A+Method+for+Stochastic+Optimization&amp;rft.date=2014&amp;rft_id=info%3Aarxiv%2F1412.6980&amp;rft.aulast=Kingma&amp;rft.aufirst=Diederik&amp;rft.au=Ba%2C+Jimmy&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-pytorch.org-19"><span class="mw-cite-backlink">^ <a href="#cite_ref-pytorch.org_19-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-pytorch.org_19-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://pytorch.org/docs/stable/optim.html">"torch.optim — PyTorch 2.0 documentation"</a>. <i>pytorch.org</i><span class="reference-accessdate">. Retrieved <span class="nowrap">2023-10-02</span></span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=unknown&amp;rft.jtitle=pytorch.org&amp;rft.atitle=torch.optim+%E2%80%94+PyTorch+2.0+documentation&amp;rft_id=https%3A%2F%2Fpytorch.org%2Fdocs%2Fstable%2Foptim.html&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-20"><span class="mw-cite-backlink"><b><a href="#cite_ref-20">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFNguyenDlugolinskyBobákTran2019" class="citation journal cs1">Nguyen, Giang; Dlugolinsky, Stefan; Bobák, Martin; Tran, Viet; García, Álvaro; Heredia, Ignacio; Malík, Peter; Hluchý, Ladislav (19 January 2019). <a rel="nofollow" class="external text" href="https://link.springer.com/content/pdf/10.1007/s10462-018-09679-z.pdf">"Machine Learning and Deep Learning frameworks and libraries for large-scale data mining: a survey"</a> <span class="cs1-format">(PDF)</span>. <i>Artificial Intelligence Review</i>. <b>52</b>: <span class="nowrap">77–</span>124. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1007%2Fs10462-018-09679-z">10.1007/s10462-018-09679-z</a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:254236976">254236976</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Artificial+Intelligence+Review&amp;rft.atitle=Machine+Learning+and+Deep+Learning+frameworks+and+libraries+for+large-scale+data+mining%3A+a+survey&amp;rft.volume=52&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E77-%3C%2Fspan%3E124&amp;rft.date=2019-01-19&amp;rft_id=info%3Adoi%2F10.1007%2Fs10462-018-09679-z&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A254236976%23id-name%3DS2CID&amp;rft.aulast=Nguyen&amp;rft.aufirst=Giang&amp;rft.au=Dlugolinsky%2C+Stefan&amp;rft.au=Bob%C3%A1k%2C+Martin&amp;rft.au=Tran%2C+Viet&amp;rft.au=Garc%C3%ADa%2C+%C3%81lvaro&amp;rft.au=Heredia%2C+Ignacio&amp;rft.au=Mal%C3%ADk%2C+Peter&amp;rft.au=Hluch%C3%BD%2C+Ladislav&amp;rft_id=https%3A%2F%2Flink.springer.com%2Fcontent%2Fpdf%2F10.1007%2Fs10462-018-09679-z.pdf&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-21"><span class="mw-cite-backlink"><b><a href="#cite_ref-21">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://www.tensorflow.org/api_docs/python/tf/keras/optimizers">"Module: tf.keras.optimizers | TensorFlow v2.14.0"</a>. <i>TensorFlow</i><span class="reference-accessdate">. Retrieved <span class="nowrap">2023-10-02</span></span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=unknown&amp;rft.jtitle=TensorFlow&amp;rft.atitle=Module%3A+tf.keras.optimizers+%7C+TensorFlow+v2.14.0&amp;rft_id=https%3A%2F%2Fwww.tensorflow.org%2Fapi_docs%2Fpython%2Ftf%2Fkeras%2Foptimizers&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-22"><span class="mw-cite-backlink"><b><a href="#cite_ref-22">^</a></b></span> <span class="reference-text">Jenny Rose Finkel, Alex Kleeman, Christopher D. Manning (2008). <a rel="nofollow" class="external text" href="http://www.aclweb.org/anthology/P08-1109">Efficient, Feature-based, Conditional Random Field Parsing</a>. Proc. Annual Meeting of the ACL.</span> </li> <li id="cite_note-23"><span class="mw-cite-backlink"><b><a href="#cite_ref-23">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf">LeCun, Yann A., et al. "Efficient backprop." Neural networks: Tricks of the trade. Springer Berlin Heidelberg, 2012. 9-48</a></span> </li> <li id="cite_note-24"><span class="mw-cite-backlink"><b><a href="#cite_ref-24">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="https://library.seg.org/doi/abs/10.1190/1.3230502">Jerome R. Krebs, John E. 
Anderson, David Hinkley, Ramesh Neelamani, Sunwoong Lee, Anatoly Baumstein, and Martin-Daniel Lacasse, (2009), "Fast full-wavefield seismic inversion using encoded sources," GEOPHYSICS 74: WCC177-WCC188.</a></span> </li> <li id="cite_note-25"><span class="mw-cite-backlink"><b><a href="#cite_ref-25">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFAvi_Pfeffer" class="citation web cs1">Avi Pfeffer. <a rel="nofollow" class="external text" href="http://www.seas.harvard.edu/courses/cs181/files/lecture05-notes.pdf">"CS181 Lecture 5 — Perceptrons"</a> <span class="cs1-format">(PDF)</span>. Harvard University.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=CS181+Lecture+5+%E2%80%94+Perceptrons&amp;rft.pub=Harvard+University&amp;rft.au=Avi+Pfeffer&amp;rft_id=http%3A%2F%2Fwww.seas.harvard.edu%2Fcourses%2Fcs181%2Ffiles%2Flecture05-notes.pdf&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span><sup class="noprint Inline-Template"><span style="white-space: nowrap;">&#91;<i><a href="/wiki/Wikipedia:Link_rot" title="Wikipedia:Link rot"><span title="&#160;Dead link tagged June 2018">permanent dead link</span></a></i><span style="visibility:hidden; color:transparent; padding-left:2px">&#8205;</span>&#93;</span></sup></span> </li> <li id="cite_note-26"><span class="mw-cite-backlink"><b><a href="#cite_ref-26">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFGoodfellowBengioCourville2016" class="citation book cs1"><a href="/wiki/Ian_Goodfellow" title="Ian Goodfellow">Goodfellow, Ian</a>; Bengio, Yoshua; Courville, Aaron (2016). <a rel="nofollow" class="external text" href="https://www.deeplearningbook.org"><i>Deep Learning</i></a>. MIT Press. p.&#160;291. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a>&#160;<a href="/wiki/Special:BookSources/978-0262035613" title="Special:BookSources/978-0262035613"><bdi>978-0262035613</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=book&amp;rft.btitle=Deep+Learning&amp;rft.pages=291&amp;rft.pub=MIT+Press&amp;rft.date=2016&amp;rft.isbn=978-0262035613&amp;rft.aulast=Goodfellow&amp;rft.aufirst=Ian&amp;rft.au=Bengio%2C+Yoshua&amp;rft.au=Courville%2C+Aaron&amp;rft_id=https%3A%2F%2Fwww.deeplearningbook.org&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-27"><span class="mw-cite-backlink"><b><a href="#cite_ref-27">^</a></b></span> <span class="reference-text">Cited by <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDarkenMoody1990" class="citation conference cs1">Darken, Christian; Moody, John (1990). <i>Fast adaptive k-means clustering: some empirical results</i>. Int'l Joint Conf. on Neural Networks (IJCNN). IEEE. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FIJCNN.1990.137720">10.1109/IJCNN.1990.137720</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=conference&amp;rft.btitle=Fast+adaptive+k-means+clustering%3A+some+empirical+results&amp;rft.pub=IEEE&amp;rft.date=1990&amp;rft_id=info%3Adoi%2F10.1109%2FIJCNN.1990.137720&amp;rft.aulast=Darken&amp;rft.aufirst=Christian&amp;rft.au=Moody%2C+John&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-28"><span class="mw-cite-backlink"><b><a href="#cite_ref-28">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSpall2003" class="citation book cs1">Spall, J. C. (2003). <i>Introduction to Stochastic Search and Optimization: Estimation, Simulation, and Control</i>. Hoboken, NJ: Wiley. pp.&#160;Sections 4.4, 6.6, and 7.5. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a>&#160;<a href="/wiki/Special:BookSources/0-471-33052-3" title="Special:BookSources/0-471-33052-3"><bdi>0-471-33052-3</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=book&amp;rft.btitle=Introduction+to+Stochastic+Search+and+Optimization%3A+Estimation%2C+Simulation%2C+and+Control&amp;rft.place=Hoboken%2C+NJ&amp;rft.pages=Sections+4.4%2C+6.6%2C+and+7.5&amp;rft.pub=Wiley&amp;rft.date=2003&amp;rft.isbn=0-471-33052-3&amp;rft.aulast=Spall&amp;rft.aufirst=J.+C.&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-29"><span class="mw-cite-backlink"><b><a href="#cite_ref-29">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFToulisAiroldi2017" class="citation journal cs1">Toulis, Panos; Airoldi, Edoardo (2017). "Asymptotic and finite-sample properties of estimators based on stochastic gradients". <i>Annals of Statistics</i>. <b>45</b> (4): <span class="nowrap">1694–</span>1727. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1408.2923">1408.2923</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1214%2F16-AOS1506">10.1214/16-AOS1506</a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:10279395">10279395</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Annals+of+Statistics&amp;rft.atitle=Asymptotic+and+finite-sample+properties+of+estimators+based+on+stochastic+gradients&amp;rft.volume=45&amp;rft.issue=4&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E1694-%3C%2Fspan%3E1727&amp;rft.date=2017&amp;rft_id=info%3Aarxiv%2F1408.2923&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A10279395%23id-name%3DS2CID&amp;rft_id=info%3Adoi%2F10.1214%2F16-AOS1506&amp;rft.aulast=Toulis&amp;rft.aufirst=Panos&amp;rft.au=Airoldi%2C+Edoardo&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-Rumelhart1986-30"><span class="mw-cite-backlink">^ <a href="#cite_ref-Rumelhart1986_30-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Rumelhart1986_30-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRumelhartHinton,_Geoffrey_E.Williams,_Ronald_J.1986" class="citation journal cs1">Rumelhart, David E.; Hinton, Geoffrey E.; Williams, Ronald J. (8 October 1986). "Learning representations by back-propagating errors". <i>Nature</i>. <b>323</b> (6088): <span class="nowrap">533–</span>536. <a href="/wiki/Bibcode_(identifier)" class="mw-redirect" title="Bibcode (identifier)">Bibcode</a>:<a rel="nofollow" class="external text" href="https://ui.adsabs.harvard.edu/abs/1986Natur.323..533R">1986Natur.323..533R</a>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1038%2F323533a0">10.1038/323533a0</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:205001834">205001834</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Nature&amp;rft.atitle=Learning+representations+by+back-propagating+errors&amp;rft.volume=323&amp;rft.issue=6088&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E533-%3C%2Fspan%3E536&amp;rft.date=1986-10-08&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A205001834%23id-name%3DS2CID&amp;rft_id=info%3Adoi%2F10.1038%2F323533a0&amp;rft_id=info%3Abibcode%2F1986Natur.323..533R&amp;rft.aulast=Rumelhart&amp;rft.aufirst=David+E.&amp;rft.au=Hinton%2C+Geoffrey+E.&amp;rft.au=Williams%2C+Ronald+J.&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-31"><span class="mw-cite-backlink"><b><a href="#cite_ref-31">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://boostedml.com/2020/07/gradient-descent-and-momentum-the-heavy-ball-method.html">"Gradient Descent and Momentum: The Heavy Ball Method"</a>. 
13 July 2020.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=Gradient+Descent+and+Momentum%3A+The+Heavy+Ball+Method&amp;rft.date=2020-07-13&amp;rft_id=https%3A%2F%2Fboostedml.com%2F2020%2F07%2Fgradient-descent-and-momentum-the-heavy-ball-method.html&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-Sutskever2013-32"><span class="mw-cite-backlink"><b><a href="#cite_ref-Sutskever2013_32-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSutskeverMartens,_JamesDahl,_GeorgeHinton,_Geoffrey_E.2013" class="citation conference cs1">Sutskever, Ilya; Martens, James; Dahl, George; Hinton, Geoffrey E. (June 2013). Sanjoy Dasgupta and David Mcallester (ed.). <a rel="nofollow" class="external text" href="http://www.cs.utoronto.ca/~ilya/pubs/2013/1051_2.pdf"><i>On the importance of initialization and momentum in deep learning</i></a> <span class="cs1-format">(PDF)</span>. In Proceedings of the 30th international conference on machine learning (ICML-13). Vol.&#160;28. Atlanta, GA. pp.&#160;<span class="nowrap">1139–</span>1147<span class="reference-accessdate">. Retrieved <span class="nowrap">14 January</span> 2016</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=conference&amp;rft.btitle=On+the+importance+of+initialization+and+momentum+in+deep+learning&amp;rft.place=Atlanta%2C+GA&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E1139-%3C%2Fspan%3E1147&amp;rft.date=2013-06&amp;rft.aulast=Sutskever&amp;rft.aufirst=Ilya&amp;rft.au=Martens%2C+James&amp;rft.au=Dahl%2C+George&amp;rft.au=Hinton%2C+Geoffrey+E.&amp;rft_id=http%3A%2F%2Fwww.cs.utoronto.ca%2F~ilya%2Fpubs%2F2013%2F1051_2.pdf&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-SutskeverPhD-33"><span class="mw-cite-backlink"><b><a href="#cite_ref-SutskeverPhD_33-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSutskever2013" class="citation thesis cs1">Sutskever, Ilya (2013). <a rel="nofollow" class="external text" href="http://www.cs.utoronto.ca/~ilya/pubs/ilya_sutskever_phd_thesis.pdf"><i>Training recurrent neural networks</i></a> <span class="cs1-format">(PDF)</span> (Ph.D.). University of Toronto. p.&#160;74.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Adissertation&amp;rft.title=Training+recurrent+neural+networks&amp;rft.inst=University+of+Toronto&amp;rft.date=2013&amp;rft.aulast=Sutskever&amp;rft.aufirst=Ilya&amp;rft_id=http%3A%2F%2Fwww.cs.utoronto.ca%2F~ilya%2Fpubs%2Filya_sutskever_phd_thesis.pdf&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-Zeiler_2012-34"><span class="mw-cite-backlink">^ <a href="#cite_ref-Zeiler_2012_34-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Zeiler_2012_34-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFZeiler2012" class="citation arxiv cs1">Zeiler, Matthew D. (2012). "ADADELTA: An adaptive learning rate method". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1212.5701">1212.5701</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=preprint&amp;rft.jtitle=arXiv&amp;rft.atitle=ADADELTA%3A+An+adaptive+learning+rate+method&amp;rft.date=2012&amp;rft_id=info%3Aarxiv%2F1212.5701&amp;rft.aulast=Zeiler&amp;rft.aufirst=Matthew+D.&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-Borysenko2021-35"><span class="mw-cite-backlink"><b><a href="#cite_ref-Borysenko2021_35-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBorysenkoByshkin,_Maksym2021" class="citation journal cs1">Borysenko, Oleksandr; Byshkin, Maksym (2021). <a rel="nofollow" class="external text" href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8139967">"CoolMomentum: A Method for Stochastic Optimization by Langevin Dynamics with Simulated Annealing"</a>. <i>Scientific Reports</i>. <b>11</b> (1): 10705. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2005.14605">2005.14605</a></span>. <a href="/wiki/Bibcode_(identifier)" class="mw-redirect" title="Bibcode (identifier)">Bibcode</a>:<a rel="nofollow" class="external text" href="https://ui.adsabs.harvard.edu/abs/2021NatSR..1110705B">2021NatSR..1110705B</a>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1038%2Fs41598-021-90144-3">10.1038/s41598-021-90144-3</a>. <a href="/wiki/PMC_(identifier)" class="mw-redirect" title="PMC (identifier)">PMC</a>&#160;<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8139967">8139967</a></span>. 
<a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a>&#160;<a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/34021212">34021212</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Scientific+Reports&amp;rft.atitle=CoolMomentum%3A+A+Method+for+Stochastic+Optimization+by+Langevin+Dynamics+with+Simulated+Annealing&amp;rft.volume=11&amp;rft.issue=1&amp;rft.pages=10705&amp;rft.date=2021&amp;rft_id=https%3A%2F%2Fwww.ncbi.nlm.nih.gov%2Fpmc%2Farticles%2FPMC8139967%23id-name%3DPMC&amp;rft_id=info%3Abibcode%2F2021NatSR..1110705B&amp;rft_id=info%3Aarxiv%2F2005.14605&amp;rft_id=info%3Apmid%2F34021212&amp;rft_id=info%3Adoi%2F10.1038%2Fs41598-021-90144-3&amp;rft.aulast=Borysenko&amp;rft.aufirst=Oleksandr&amp;rft.au=Byshkin%2C+Maksym&amp;rft_id=https%3A%2F%2Fwww.ncbi.nlm.nih.gov%2Fpmc%2Farticles%2FPMC8139967&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-36"><span class="mw-cite-backlink"><b><a href="#cite_ref-36">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://paperswithcode.com/method/nesterov-accelerated-gradient">"Papers with Code - Nesterov Accelerated Gradient Explained"</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=Papers+with+Code+-+Nesterov+Accelerated+Gradient+Explained&amp;rft_id=https%3A%2F%2Fpaperswithcode.com%2Fmethod%2Fnesterov-accelerated-gradient&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-37"><span class="mw-cite-backlink"><b><a href="#cite_ref-37">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPolyakJuditsky1992" class="citation journal cs1">Polyak, Boris T.; Juditsky, Anatoli B. (1992). <a rel="nofollow" class="external text" href="https://web.archive.org/web/20160112091615/http://www.meyn.ece.ufl.edu/archive/spm_files/Courses/ECE555-2011/555media/poljud92.pdf">"Acceleration of stochastic approximation by averaging"</a> <span class="cs1-format">(PDF)</span>. <i>SIAM J. Control Optim</i>. <b>30</b> (4): <span class="nowrap">838–</span>855. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1137%2F0330046">10.1137/0330046</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:3548228">3548228</a>. Archived from <a rel="nofollow" class="external text" href="http://www.meyn.ece.ufl.edu/archive/spm_files/Courses/ECE555-2011/555media/poljud92.pdf">the original</a> <span class="cs1-format">(PDF)</span> on 2016-01-12<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2018-02-14</span></span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=SIAM+J.+Control+Optim.&amp;rft.atitle=Acceleration+of+stochastic+approximation+by+averaging&amp;rft.volume=30&amp;rft.issue=4&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E838-%3C%2Fspan%3E855&amp;rft.date=1992&amp;rft_id=info%3Adoi%2F10.1137%2F0330046&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A3548228%23id-name%3DS2CID&amp;rft.aulast=Polyak&amp;rft.aufirst=Boris+T.&amp;rft.au=Juditsky%2C+Anatoli+B.&amp;rft_id=http%3A%2F%2Fwww.meyn.ece.ufl.edu%2Farchive%2Fspm_files%2FCourses%2FECE555-2011%2F555media%2Fpoljud92.pdf&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-duchi-38"><span class="mw-cite-backlink">^ <a href="#cite_ref-duchi_38-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-duchi_38-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDuchiHazanSinger2011" class="citation journal cs1">Duchi, John; Hazan, Elad; Singer, Yoram (2011). <a rel="nofollow" class="external text" href="http://jmlr.org/papers/volume12/duchi11a/duchi11a.pdf">"Adaptive subgradient methods for online learning and stochastic optimization"</a> <span class="cs1-format">(PDF)</span>. <i><a href="/wiki/Journal_of_Machine_Learning_Research" title="Journal of Machine Learning Research">JMLR</a></i>. <b>12</b>: <span class="nowrap">2121–</span>2159.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=JMLR&amp;rft.atitle=Adaptive+subgradient+methods+for+online+learning+and+stochastic+optimization&amp;rft.volume=12&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E2121-%3C%2Fspan%3E2159&amp;rft.date=2011&amp;rft.aulast=Duchi&amp;rft.aufirst=John&amp;rft.au=Hazan%2C+Elad&amp;rft.au=Singer%2C+Yoram&amp;rft_id=http%3A%2F%2Fjmlr.org%2Fpapers%2Fvolume12%2Fduchi11a%2Fduchi11a.pdf&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-40"><span class="mw-cite-backlink"><b><a href="#cite_ref-40">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFGuptaBengioWeston2014" class="citation journal cs1">Gupta, Maya R.; Bengio, Samy; Weston, Jason (2014). <a rel="nofollow" class="external text" href="http://jmlr.org/papers/volume15/gupta14a/gupta14a.pdf">"Training highly multiclass classifiers"</a> <span class="cs1-format">(PDF)</span>. <i>JMLR</i>. 
<b>15</b> (1): <span class="nowrap">1461–</span>1492.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=JMLR&amp;rft.atitle=Training+highly+multiclass+classifiers&amp;rft.volume=15&amp;rft.issue=1&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E1461-%3C%2Fspan%3E1492&amp;rft.date=2014&amp;rft.aulast=Gupta&amp;rft.aufirst=Maya+R.&amp;rft.au=Bengio%2C+Samy&amp;rft.au=Weston%2C+Jason&amp;rft_id=http%3A%2F%2Fjmlr.org%2Fpapers%2Fvolume15%2Fgupta14a%2Fgupta14a.pdf&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-rmsprop-41"><span class="mw-cite-backlink">^ <a href="#cite_ref-rmsprop_41-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-rmsprop_41-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFHinton" class="citation web cs1"><a href="/wiki/Geoffrey_Hinton" title="Geoffrey Hinton">Hinton, Geoffrey</a>. <a rel="nofollow" class="external text" href="http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf">"Lecture 6e rmsprop: Divide the gradient by a running average of its recent magnitude"</a> <span class="cs1-format">(PDF)</span>. p.&#160;26<span class="reference-accessdate">. Retrieved <span class="nowrap">19 March</span> 2020</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=Lecture+6e+rmsprop%3A+Divide+the+gradient+by+a+running+average+of+its+recent+magnitude&amp;rft.pages=26&amp;rft.aulast=Hinton&amp;rft.aufirst=Geoffrey&amp;rft_id=http%3A%2F%2Fwww.cs.toronto.edu%2F~tijmen%2Fcsc321%2Fslides%2Flecture_slides_lec6.pdf&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-Adam2014-42"><span class="mw-cite-backlink">^ <a href="#cite_ref-Adam2014_42-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Adam2014_42-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKingmaBa2014" class="citation arxiv cs1">Kingma, Diederik; Ba, Jimmy (2014). "Adam: A Method for Stochastic Optimization". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1412.6980">1412.6980</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=preprint&amp;rft.jtitle=arXiv&amp;rft.atitle=Adam%3A+A+Method+for+Stochastic+Optimization&amp;rft.date=2014&amp;rft_id=info%3Aarxiv%2F1412.6980&amp;rft.aulast=Kingma&amp;rft.aufirst=Diederik&amp;rft.au=Ba%2C+Jimmy&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-43"><span class="mw-cite-backlink"><b><a href="#cite_ref-43">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://www.oreilly.com/library/view/fundamentals-of-deep/9781491925607/ch04.html">"4. 
Beyond Gradient Descent - Fundamentals of Deep Learning &#91;Book&#93;"</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=4.+Beyond+Gradient+Descent+-+Fundamentals+of+Deep+Learning+%26%2391%3BBook%26%2393%3B&amp;rft_id=https%3A%2F%2Fwww.oreilly.com%2Flibrary%2Fview%2Ffundamentals-of-deep%2F9781491925607%2Fch04.html&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-44"><span class="mw-cite-backlink"><b><a href="#cite_ref-44">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFReddiKaleKumar2018" class="citation conference cs1">Reddi, Sashank J.; Kale, Satyen; Kumar, Sanjiv (2018). <a rel="nofollow" class="external text" href="https://openreview.net/forum?id=ryQu7f-RZ"><i>On the Convergence of Adam and Beyond</i></a>. 6th International Conference on Learning Representations (ICLR 2018). <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1904.09237">1904.09237</a></span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=conference&amp;rft.btitle=On+the+Convergence+of+Adam+and+Beyond&amp;rft.date=2018&amp;rft_id=info%3Aarxiv%2F1904.09237&amp;rft.aulast=Reddi&amp;rft.aufirst=Sashank+J.&amp;rft.au=Kale%2C+Satyen&amp;rft.au=Kumar%2C+Sanjiv&amp;rft_id=https%3A%2F%2Fopenreview.net%2Fforum%3Fid%3DryQu7f-RZ&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-45"><span class="mw-cite-backlink"><b><a href="#cite_ref-45">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRubio2017" class="citation thesis cs1">Rubio, David Martínez (2017). <a rel="nofollow" class="external text" href="https://damaru2.github.io/convergence_analysis_hypergradient_descent/dissertation_hypergradients.pdf"><i>Convergence Analysis of an Adaptive Method of Gradient Descent</i></a> <span class="cs1-format">(PDF)</span> (Master thesis). University of Oxford<span class="reference-accessdate">. Retrieved <span class="nowrap">5 January</span> 2024</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Adissertation&amp;rft.title=Convergence+Analysis+of+an+Adaptive+Method+of+Gradient+Descent&amp;rft.degree=Master&amp;rft.inst=University+of+Oxford&amp;rft.date=2017&amp;rft.aulast=Rubio&amp;rft.aufirst=David+Mart%C3%ADnez&amp;rft_id=https%3A%2F%2Fdamaru2.github.io%2Fconvergence_analysis_hypergradient_descent%2Fdissertation_hypergradients.pdf&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-46"><span class="mw-cite-backlink"><b><a href="#cite_ref-46">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFZhangChenShiSun2022" class="citation conference cs1">Zhang, Yushun; Chen, Congliang; Shi, Naichen; Sun, Ruoyu; Luo, Zhi-Quan (2022). "Adam Can Converge Without Any Modification On Update Rules". <i>Advances in Neural Information Processing Systems 35</i>. 
Advances in Neural Information Processing Systems 35 (NeurIPS 2022). <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2208.09632">2208.09632</a></span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=conference&amp;rft.atitle=Adam+Can+Converge+Without+Any+Modification+On+Update+Rules&amp;rft.btitle=Advances+in+Neural+Information+Processing+Systems+35&amp;rft.date=2022&amp;rft_id=info%3Aarxiv%2F2208.09632&amp;rft.aulast=Zhang&amp;rft.aufirst=Yushun&amp;rft.au=Chen%2C+Congliang&amp;rft.au=Shi%2C+Naichen&amp;rft.au=Sun%2C+Ruoyu&amp;rft.au=Luo%2C+Zhi-Quan&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-47"><span class="mw-cite-backlink"><b><a href="#cite_ref-47">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFDozat2016" class="citation journal cs1">Dozat, T. (2016). "Incorporating Nesterov Momentum into Adam". <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:70293087">70293087</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.atitle=Incorporating+Nesterov+Momentum+into+Adam&amp;rft.date=2016&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A70293087%23id-name%3DS2CID&amp;rft.aulast=Dozat&amp;rft.aufirst=T.&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span> <span class="cs1-visible-error citation-comment"><code class="cs1-code">{{<a href="/wiki/Template:Cite_journal" title="Template:Cite journal">cite journal</a>}}</code>: </span><span class="cs1-visible-error citation-comment">Cite journal requires <code class="cs1-code">&#124;journal=</code> (<a href="/wiki/Help:CS1_errors#missing_periodical" title="Help:CS1 errors">help</a>)</span></span> </li> <li id="cite_note-48"><span class="mw-cite-backlink"><b><a href="#cite_ref-48">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFNaveen2022" class="citation journal cs1">Naveen, Philip (2022-08-09). <a rel="nofollow" class="external text" href="https://dx.doi.org/10.36227/techrxiv.20427852.v1">"FASFA: A Novel Next-Generation Backpropagation Optimizer"</a>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.36227%2Ftechrxiv.20427852.v1">10.36227/techrxiv.20427852.v1</a><span class="reference-accessdate">. 
Retrieved <span class="nowrap">2022-11-19</span></span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.atitle=FASFA%3A+A+Novel+Next-Generation+Backpropagation+Optimizer&amp;rft.date=2022-08-09&amp;rft_id=info%3Adoi%2F10.36227%2Ftechrxiv.20427852.v1&amp;rft.aulast=Naveen&amp;rft.aufirst=Philip&amp;rft_id=http%3A%2F%2Fdx.doi.org%2F10.36227%2Ftechrxiv.20427852.v1&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span> <span class="cs1-visible-error citation-comment"><code class="cs1-code">{{<a href="/wiki/Template:Cite_journal" title="Template:Cite journal">cite journal</a>}}</code>: </span><span class="cs1-visible-error citation-comment">Cite journal requires <code class="cs1-code">&#124;journal=</code> (<a href="/wiki/Help:CS1_errors#missing_periodical" title="Help:CS1 errors">help</a>)</span></span> </li> <li id="cite_note-49"><span class="mw-cite-backlink"><b><a href="#cite_ref-49">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWhye2021" class="citation book cs1">Whye, Schwarz, Jonathan Jayakumar, Siddhant M. Pascanu, Razvan Latham, Peter E. Teh, Yee (2021-10-01). <a rel="nofollow" class="external text" href="http://worldcat.org/oclc/1333722169"><i>Powerpropagation: A sparsity inducing weight reparameterisation</i></a>. <a href="/wiki/OCLC_(identifier)" class="mw-redirect" title="OCLC (identifier)">OCLC</a>&#160;<a rel="nofollow" class="external text" href="https://search.worldcat.org/oclc/1333722169">1333722169</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=book&amp;rft.btitle=Powerpropagation%3A+A+sparsity+inducing+weight+reparameterisation&amp;rft.date=2021-10-01&amp;rft_id=info%3Aoclcnum%2F1333722169&amp;rft.aulast=Whye&amp;rft.aufirst=Schwarz%2C+Jonathan+Jayakumar%2C+Siddhant+M.+Pascanu%2C+Razvan+Latham%2C+Peter+E.+Teh%2C+Yee&amp;rft_id=http%3A%2F%2Fworldcat.org%2Foclc%2F1333722169&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span><span class="cs1-maint citation-comment"><code class="cs1-code">{{<a href="/wiki/Template:Cite_book" title="Template:Cite book">cite book</a>}}</code>: CS1 maint: multiple names: authors list (<a href="/wiki/Category:CS1_maint:_multiple_names:_authors_list" title="Category:CS1 maint: multiple names: authors list">link</a>)</span></span> </li> <li id="cite_note-50"><span class="mw-cite-backlink"><b><a href="#cite_ref-50">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFHuLinTang2019" class="citation journal cs1">Hu, Yuzheng; Lin, Licong; Tang, Shange (2019-12-20). "Second-order Information in First-order Optimization Methods". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1912.09926">1912.09926</a></span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.atitle=Second-order+Information+in+First-order+Optimization+Methods&amp;rft.date=2019-12-20&amp;rft_id=info%3Aarxiv%2F1912.09926&amp;rft.aulast=Hu&amp;rft.aufirst=Yuzheng&amp;rft.au=Lin%2C+Licong&amp;rft.au=Tang%2C+Shange&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span> <span class="cs1-visible-error citation-comment"><code class="cs1-code">{{<a href="/wiki/Template:Cite_journal" title="Template:Cite journal">cite journal</a>}}</code>: </span><span class="cs1-visible-error citation-comment">Cite journal requires <code class="cs1-code">&#124;journal=</code> (<a href="/wiki/Help:CS1_errors#missing_periodical" title="Help:CS1 errors">help</a>)</span></span> </li> <li id="cite_note-51"><span class="mw-cite-backlink"><b><a href="#cite_ref-51">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFReddiKaleKumar2018" class="citation journal cs1">Reddi, Sashank J.; Kale, Satyen; Kumar, Sanjiv (2018). "On the Convergence of Adam and Beyond". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1904.09237">1904.09237</a></span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.atitle=On+the+Convergence+of+Adam+and+Beyond&amp;rft.date=2018&amp;rft_id=info%3Aarxiv%2F1904.09237&amp;rft.aulast=Reddi&amp;rft.aufirst=Sashank+J.&amp;rft.au=Kale%2C+Satyen&amp;rft.au=Kumar%2C+Sanjiv&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span> <span class="cs1-visible-error citation-comment"><code class="cs1-code">{{<a href="/wiki/Template:Cite_journal" title="Template:Cite journal">cite journal</a>}}</code>: </span><span class="cs1-visible-error citation-comment">Cite journal requires <code class="cs1-code">&#124;journal=</code> (<a href="/wiki/Help:CS1_errors#missing_periodical" title="Help:CS1 errors">help</a>)</span></span> </li> <li id="cite_note-52"><span class="mw-cite-backlink"><b><a href="#cite_ref-52">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://www.ruder.io/optimizing-gradient-descent/#amsgrad">"An overview of gradient descent optimization algorithms"</a>. 
19 January 2016.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=An+overview+of+gradient+descent+optimization+algorithms&amp;rft.date=2016-01-19&amp;rft_id=https%3A%2F%2Fwww.ruder.io%2Foptimizing-gradient-descent%2F%23amsgrad&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-53"><span class="mw-cite-backlink"><b><a href="#cite_ref-53">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFTranPhong2019" class="citation journal cs1">Tran, Phuong Thi; Phong, Le Trieu (2019). <a rel="nofollow" class="external text" href="https://ieeexplore.ieee.org/document/8713445">"On the Convergence Proof of AMSGrad and a New Version"</a>. <i>IEEE Access</i>. <b>7</b>: <span class="nowrap">61706–</span>61716. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1904.03590">1904.03590</a></span>. <a href="/wiki/Bibcode_(identifier)" class="mw-redirect" title="Bibcode (identifier)">Bibcode</a>:<a rel="nofollow" class="external text" href="https://ui.adsabs.harvard.edu/abs/2019IEEEA...761706T">2019IEEEA...761706T</a>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FACCESS.2019.2916341">10.1109/ACCESS.2019.2916341</a>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a>&#160;<a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/2169-3536">2169-3536</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=IEEE+Access&amp;rft.atitle=On+the+Convergence+Proof+of+AMSGrad+and+a+New+Version&amp;rft.volume=7&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E61706-%3C%2Fspan%3E61716&amp;rft.date=2019&amp;rft_id=info%3Aarxiv%2F1904.03590&amp;rft.issn=2169-3536&amp;rft_id=info%3Adoi%2F10.1109%2FACCESS.2019.2916341&amp;rft_id=info%3Abibcode%2F2019IEEEA...761706T&amp;rft.aulast=Tran&amp;rft.aufirst=Phuong+Thi&amp;rft.au=Phong%2C+Le+Trieu&amp;rft_id=https%3A%2F%2Fieeexplore.ieee.org%2Fdocument%2F8713445&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-AdamW-54"><span class="mw-cite-backlink"><b><a href="#cite_ref-AdamW_54-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLoshchilovHutter2019" class="citation journal cs1">Loshchilov, Ilya; Hutter, Frank (4 January 2019). "Decoupled Weight Decay Regularization". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1711.05101">1711.05101</a></span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.atitle=Decoupled+Weight+Decay+Regularization&amp;rft.date=2019-01-04&amp;rft_id=info%3Aarxiv%2F1711.05101&amp;rft.aulast=Loshchilov&amp;rft.aufirst=Ilya&amp;rft.au=Hutter%2C+Frank&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span> <span class="cs1-visible-error citation-comment"><code class="cs1-code">{{<a href="/wiki/Template:Cite_journal" title="Template:Cite journal">cite journal</a>}}</code>: </span><span class="cs1-visible-error citation-comment">Cite journal requires <code class="cs1-code">&#124;journal=</code> (<a href="/wiki/Help:CS1_errors#missing_periodical" title="Help:CS1 errors">help</a>)</span></span> </li> <li id="cite_note-55"><span class="mw-cite-backlink"><b><a href="#cite_ref-55">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBallesHennig2018" class="citation web cs1">Balles, Lukas; Hennig, Philipp (15 February 2018). <a rel="nofollow" class="external text" href="https://openreview.net/forum?id=S1EwLkW0W">"Dissecting Adam: The Sign, Magnitude and Variance of Stochastic Gradients"</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=Dissecting+Adam%3A+The+Sign%2C+Magnitude+and+Variance+of+Stochastic+Gradients&amp;rft.date=2018-02-15&amp;rft.aulast=Balles&amp;rft.aufirst=Lukas&amp;rft.au=Hennig%2C+Philipp&amp;rft_id=https%3A%2F%2Fopenreview.net%2Fforum%3Fid%3DS1EwLkW0W&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-56"><span class="mw-cite-backlink"><b><a href="#cite_ref-56">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://proceedings.mlr.press/v80/bernstein18a.html">"SignSGD: Compressed Optimisation for Non-Convex Problems"</a>. 3 July 2018. pp.&#160;<span class="nowrap">560–</span>569.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=SignSGD%3A+Compressed+Optimisation+for+Non-Convex+Problems&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E560-%3C%2Fspan%3E569&amp;rft.date=2018-07-03&amp;rft_id=https%3A%2F%2Fproceedings.mlr.press%2Fv80%2Fbernstein18a.html&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-57"><span class="mw-cite-backlink"><b><a href="#cite_ref-57">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFByrdHansenNocedalSinger2016" class="citation journal cs1">Byrd, R. H.; Hansen, S. L.; Nocedal, J.; Singer, Y. (2016). "A Stochastic Quasi-Newton method for Large-Scale Optimization". <i>SIAM Journal on Optimization</i>. <b>26</b> (2): <span class="nowrap">1008–</span>1031. 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1401.7020">1401.7020</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1137%2F140954362">10.1137/140954362</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:12396034">12396034</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=SIAM+Journal+on+Optimization&amp;rft.atitle=A+Stochastic+Quasi-Newton+method+for+Large-Scale+Optimization&amp;rft.volume=26&amp;rft.issue=2&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E1008-%3C%2Fspan%3E1031&amp;rft.date=2016&amp;rft_id=info%3Aarxiv%2F1401.7020&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A12396034%23id-name%3DS2CID&amp;rft_id=info%3Adoi%2F10.1137%2F140954362&amp;rft.aulast=Byrd&amp;rft.aufirst=R.+H.&amp;rft.au=Hansen%2C+S.+L.&amp;rft.au=Nocedal%2C+J.&amp;rft.au=Singer%2C+Y.&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-58"><span class="mw-cite-backlink"><b><a href="#cite_ref-58">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSpall2000" class="citation journal cs1">Spall, J. C. (2000). "Adaptive Stochastic Approximation by the Simultaneous Perturbation Method". <i>IEEE Transactions on Automatic Control</i>. <b>45</b> (10): 1839−1853. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FTAC.2000.880982">10.1109/TAC.2000.880982</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=IEEE+Transactions+on+Automatic+Control&amp;rft.atitle=Adaptive+Stochastic+Approximation+by+the+Simultaneous+Perturbation+Method&amp;rft.volume=45&amp;rft.issue=10&amp;rft.pages=1839%E2%88%921853&amp;rft.date=2000&amp;rft_id=info%3Adoi%2F10.1109%2FTAC.2000.880982&amp;rft.aulast=Spall&amp;rft.aufirst=J.+C.&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-59"><span class="mw-cite-backlink"><b><a href="#cite_ref-59">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSpall2009" class="citation journal cs1">Spall, J. C. (2009). "Feedback and Weighting Mechanisms for Improving Jacobian Estimates in the Adaptive Simultaneous Perturbation Algorithm". <i>IEEE Transactions on Automatic Control</i>. <b>54</b> (6): <span class="nowrap">1216–</span>1229. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FTAC.2009.2019793">10.1109/TAC.2009.2019793</a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:3564529">3564529</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=IEEE+Transactions+on+Automatic+Control&amp;rft.atitle=Feedback+and+Weighting+Mechanisms+for+Improving+Jacobian+Estimates+in+the+Adaptive+Simultaneous+Perturbation+Algorithm&amp;rft.volume=54&amp;rft.issue=6&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E1216-%3C%2Fspan%3E1229&amp;rft.date=2009&amp;rft_id=info%3Adoi%2F10.1109%2FTAC.2009.2019793&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A3564529%23id-name%3DS2CID&amp;rft.aulast=Spall&amp;rft.aufirst=J.+C.&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-60"><span class="mw-cite-backlink"><b><a href="#cite_ref-60">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBhatnagarPrasadPrashanth2013" class="citation book cs1">Bhatnagar, S.; Prasad, H. L.; Prashanth, L. A. (2013). <i>Stochastic Recursive Algorithms for Optimization: Simultaneous Perturbation Methods</i>. London: Springer. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a>&#160;<a href="/wiki/Special:BookSources/978-1-4471-4284-3" title="Special:BookSources/978-1-4471-4284-3"><bdi>978-1-4471-4284-3</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=book&amp;rft.btitle=Stochastic+Recursive+Algorithms+for+Optimization%3A+Simultaneous+Perturbation+Methods&amp;rft.place=London&amp;rft.pub=Springer&amp;rft.date=2013&amp;rft.isbn=978-1-4471-4284-3&amp;rft.aulast=Bhatnagar&amp;rft.aufirst=S.&amp;rft.au=Prasad%2C+H.+L.&amp;rft.au=Prashanth%2C+L.+A.&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-61"><span class="mw-cite-backlink"><b><a href="#cite_ref-61">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRuppert1985" class="citation journal cs1">Ruppert, D. (1985). <a rel="nofollow" class="external text" href="https://doi.org/10.1214%2Faos%2F1176346589">"A Newton-Raphson Version of the Multivariate Robbins-Monro Procedure"</a>. <i><a href="/wiki/Annals_of_Statistics" title="Annals of Statistics">Annals of Statistics</a></i>. <b>13</b> (1): <span class="nowrap">236–</span>245. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.1214%2Faos%2F1176346589">10.1214/aos/1176346589</a></span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Annals+of+Statistics&amp;rft.atitle=A+Newton-Raphson+Version+of+the+Multivariate+Robbins-Monro+Procedure&amp;rft.volume=13&amp;rft.issue=1&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E236-%3C%2Fspan%3E245&amp;rft.date=1985&amp;rft_id=info%3Adoi%2F10.1214%2Faos%2F1176346589&amp;rft.aulast=Ruppert&amp;rft.aufirst=D.&amp;rft_id=https%3A%2F%2Fdoi.org%2F10.1214%252Faos%252F1176346589&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-62"><span class="mw-cite-backlink"><b><a href="#cite_ref-62">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFAmari1998" class="citation journal cs1">Amari, S. (1998). "Natural gradient works efficiently in learning". <i>Neural Computation</i>. <b>10</b> (2): <span class="nowrap">251–</span>276. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1162%2F089976698300017746">10.1162/089976698300017746</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:207585383">207585383</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Neural+Computation&amp;rft.atitle=Natural+gradient+works+efficiently+in+learning&amp;rft.volume=10&amp;rft.issue=2&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E251-%3C%2Fspan%3E276&amp;rft.date=1998&amp;rft_id=info%3Adoi%2F10.1162%2F089976698300017746&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A207585383%23id-name%3DS2CID&amp;rft.aulast=Amari&amp;rft.aufirst=S.&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-63"><span class="mw-cite-backlink"><b><a href="#cite_ref-63">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBrust2021" class="citation conference cs1">Brust, J.J. (2021). "Nonlinear least squares for large-scale machine learning using stochastic Jacobian estimates". <i>Workshop: Beyond First Order Methods in Machine Learning</i>. ICML 2021. 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2107.05598">2107.05598</a></span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=conference&amp;rft.atitle=Nonlinear+least+squares+for+large-scale+machine+learning+using+stochastic+Jacobian+estimates&amp;rft.btitle=Workshop%3A+Beyond+First+Order+Methods+in+Machine+Learning&amp;rft.date=2021&amp;rft_id=info%3Aarxiv%2F2107.05598&amp;rft.aulast=Brust&amp;rft.aufirst=J.J.&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-64"><span class="mw-cite-backlink"><b><a href="#cite_ref-64">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLiTaiE2019" class="citation journal cs1">Li, Qianxiao; Tai, Cheng; E, Weinan (2019). <a rel="nofollow" class="external text" href="http://jmlr.org/papers/v20/17-526.html">"Stochastic Modified Equations and Dynamics of Stochastic Gradient Algorithms I: Mathematical Foundations"</a>. <i>Journal of Machine Learning Research</i>. <b>20</b> (40): <span class="nowrap">1–</span>47. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1811.01558">1811.01558</a></span>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a>&#160;<a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/1533-7928">1533-7928</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Journal+of+Machine+Learning+Research&amp;rft.atitle=Stochastic+Modified+Equations+and+Dynamics+of+Stochastic+Gradient+Algorithms+I%3A+Mathematical+Foundations&amp;rft.volume=20&amp;rft.issue=40&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E1-%3C%2Fspan%3E47&amp;rft.date=2019&amp;rft_id=info%3Aarxiv%2F1811.01558&amp;rft.issn=1533-7928&amp;rft.aulast=Li&amp;rft.aufirst=Qianxiao&amp;rft.au=Tai%2C+Cheng&amp;rft.au=E%2C+Weinan&amp;rft_id=http%3A%2F%2Fjmlr.org%2Fpapers%2Fv20%2F17-526.html&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3AStochastic+gradient+descent" class="Z3988"></span></span> </li> <li id="cite_note-65"><span class="mw-cite-backlink"><b><a href="#cite_ref-65">^</a></b></span> <span class="reference-text"> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFGessKassingKonarovskyi2023" class="citation arxiv cs1">Gess, Benjamin; Kassing, Sebastian; Konarovskyi, Vitalii (14 February 2023). "Stochastic Modified Flows, Mean-Field Limits and Dynamics of Stochastic Gradient Descent". 
Further reading

- Bottou, Léon (2004), "Stochastic Learning", Advanced Lectures on Machine Learning, LNAI, vol. 3176, Springer, pp. 146–168, ISBN 978-3-540-23122-6. http://leon.bottou.org/papers/bottou-mlss-2004
- Buduma, Nikhil; Locascio, Nicholas (2017), "Beyond Gradient Descent", Fundamentals of Deep Learning: Designing Next-Generation Machine Intelligence Algorithms, O'Reilly, ISBN 9781491925584.
- LeCun, Yann A.; Bottou, Léon; Orr, Genevieve B.; Müller, Klaus-Robert (2012), "Efficient BackProp", Neural Networks: Tricks of the Trade, Springer, pp. 9–48, ISBN 978-3-642-35288-1.
- Spall, James C. (2003), Introduction to Stochastic Search and Optimization, Wiley, ISBN 978-0-471-33052-3.

External links

- "Gradient Descent, How Neural Networks Learn". 3Blue1Brown, October 16, 2017, via YouTube: https://www.youtube.com/watch?v=IHZwWFHWa-w&list=PLZHQObOWTQDNU6R1_67000Dx_ZCJB-3pi&index=2 (archived 2021-12-22).
- Goh (April 4, 2017). "Why Momentum Really Works". Distill. 2 (4). doi:10.23915/distill.00006. https://distill.pub/2017/momentum/. Interactive paper explaining momentum.
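The Distill article above treats momentum interactively; as a rough companion, the sketch below shows the classical heavy-ball update (v ← βv + g, then w ← w − ηv) applied with stochastic gradients to a toy least-squares problem. The objective, data, and hyperparameter values (lr, beta) are illustrative assumptions, not taken from any of the works listed here.

```python
import numpy as np

# Toy least-squares problem: minimize 0.5 * ||A w - b||^2 over w.
# All names and values here are illustrative assumptions.
rng = np.random.default_rng(0)
A = rng.normal(size=(100, 10))
b = rng.normal(size=100)

w = np.zeros(10)       # parameters
v = np.zeros(10)       # velocity (momentum buffer)
lr, beta = 0.005, 0.9  # step size and momentum coefficient

for step in range(500):
    i = rng.integers(0, 100)        # sample one training example
    g = (A[i] @ w - b[i]) * A[i]    # stochastic gradient of 0.5 * (A_i . w - b_i)^2
    v = beta * v + g                # exponentially weighted sum of past gradients
    w = w - lr * v                  # heavy-ball parameter update

print("average per-example loss:", 0.5 * np.mean((A @ w - b) ** 2))
```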