Vanishing gradient problem

From Wikipedia, the free encyclopedia

Machine learning model training problem
[o]" accesskey="o"><span class="vector-icon mw-ui-icon-logIn mw-ui-icon-wikimedia-logIn"></span> <span>Log in</span></a></li> </ul> </div> </div> <div id="p-user-menu-anon-editor" class="vector-menu mw-portlet mw-portlet-user-menu-anon-editor" > <div class="vector-menu-heading"> Pages for logged out editors <a href="/wiki/Help:Introduction" aria-label="Learn more about editing"><span>learn more</span></a> </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-anoncontribs" class="mw-list-item"><a href="/wiki/Special:MyContributions" title="A list of edits made from this IP address [y]" accesskey="y"><span>Contributions</span></a></li><li id="pt-anontalk" class="mw-list-item"><a href="/wiki/Special:MyTalk" title="Discussion about edits from this IP address [n]" accesskey="n"><span>Talk</span></a></li> </ul> </div> </div> </div> </div> </nav> </div> </header> </div> <div class="mw-page-container"> <div class="mw-page-container-inner"> <div class="vector-sitenotice-container"> <div id="siteNotice"><!-- CentralNotice --></div> </div> <div class="vector-column-start"> <div class="vector-main-menu-container"> <div id="mw-navigation"> <nav id="mw-panel" class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-pinned-container" class="vector-pinned-container"> </div> </nav> </div> </div> <div class="vector-sticky-pinned-container"> <nav id="mw-panel-toc" aria-label="Contents" data-event-name="ui.sidebar-toc" class="mw-table-of-contents-container vector-toc-landmark"> <div id="vector-toc-pinned-container" class="vector-pinned-container"> <div id="vector-toc" class="vector-toc vector-pinnable-element"> <div class="vector-pinnable-header vector-toc-pinnable-header vector-pinnable-header-pinned" data-feature-name="toc-pinned" data-pinnable-element-id="vector-toc" > <h2 class="vector-pinnable-header-label">Contents</h2> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-toc.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-toc.unpin">hide</button> </div> <ul class="vector-toc-contents" id="mw-panel-toc-list"> <li id="toc-mw-content-text" class="vector-toc-list-item vector-toc-level-1"> <a href="#" class="vector-toc-link"> <div class="vector-toc-text">(Top)</div> </a> </li> <li id="toc-Prototypical_models" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Prototypical_models"> <div class="vector-toc-text"> <span class="vector-toc-numb">1</span> <span>Prototypical models</span> </div> </a> <button aria-controls="toc-Prototypical_models-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Prototypical models subsection</span> </button> <ul id="toc-Prototypical_models-sublist" class="vector-toc-list"> <li id="toc-Recurrent_network_model" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Recurrent_network_model"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.1</span> <span>Recurrent network model</span> </div> </a> <ul id="toc-Recurrent_network_model-sublist" class="vector-toc-list"> <li id="toc-Example:_recurrent_network_with_sigmoid_activation" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" 
href="#Example:_recurrent_network_with_sigmoid_activation"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.1.1</span> <span>Example: recurrent network with sigmoid activation</span> </div> </a> <ul id="toc-Example:_recurrent_network_with_sigmoid_activation-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Dynamical_systems_model" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Dynamical_systems_model"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.2</span> <span>Dynamical systems model</span> </div> </a> <ul id="toc-Dynamical_systems_model-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Geometric_model" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Geometric_model"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.3</span> <span>Geometric model</span> </div> </a> <ul id="toc-Geometric_model-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Solutions" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Solutions"> <div class="vector-toc-text"> <span class="vector-toc-numb">2</span> <span>Solutions</span> </div> </a> <button aria-controls="toc-Solutions-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Solutions subsection</span> </button> <ul id="toc-Solutions-sublist" class="vector-toc-list"> <li id="toc-RNN" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#RNN"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.1</span> <span>RNN</span> </div> </a> <ul id="toc-RNN-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Batch_normalization" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Batch_normalization"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.2</span> <span>Batch normalization</span> </div> </a> <ul id="toc-Batch_normalization-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Multi-level_hierarchy" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Multi-level_hierarchy"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.3</span> <span>Multi-level hierarchy</span> </div> </a> <ul id="toc-Multi-level_hierarchy-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Deep_belief_network" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Deep_belief_network"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.4</span> <span>Deep belief network</span> </div> </a> <ul id="toc-Deep_belief_network-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Faster_hardware" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Faster_hardware"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.5</span> <span>Faster hardware</span> </div> </a> <ul id="toc-Faster_hardware-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Residual_connection" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Residual_connection"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.6</span> <span>Residual connection</span> </div> </a> <ul id="toc-Residual_connection-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Other_activation_functions" class="vector-toc-list-item 
vector-toc-level-2"> <a class="vector-toc-link" href="#Other_activation_functions"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.7</span> <span>Other activation functions</span> </div> </a> <ul id="toc-Other_activation_functions-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Weight_initialization" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Weight_initialization"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.8</span> <span>Weight initialization</span> </div> </a> <ul id="toc-Weight_initialization-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Other" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Other"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.9</span> <span>Other</span> </div> </a> <ul id="toc-Other-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-See_also" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#See_also"> <div class="vector-toc-text"> <span class="vector-toc-numb">3</span> <span>See also</span> </div> </a> <ul id="toc-See_also-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Notes" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Notes"> <div class="vector-toc-text"> <span class="vector-toc-numb">4</span> <span>Notes</span> </div> </a> <ul id="toc-Notes-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-References" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#References"> <div class="vector-toc-text"> <span class="vector-toc-numb">5</span> <span>References</span> </div> </a> <ul id="toc-References-sublist" class="vector-toc-list"> </ul> </li> </ul> </div> </div> </nav> </div> </div> <div class="mw-content-container"> <main id="content" class="mw-body"> <header class="mw-body-header vector-page-titlebar"> <nav aria-label="Contents" class="vector-toc-landmark"> <div id="vector-page-titlebar-toc" class="vector-dropdown vector-page-titlebar-toc vector-button-flush-left" title="Table of Contents" > <input type="checkbox" id="vector-page-titlebar-toc-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-titlebar-toc" class="vector-dropdown-checkbox " aria-label="Toggle the table of contents" > <label id="vector-page-titlebar-toc-label" for="vector-page-titlebar-toc-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-listBullet mw-ui-icon-wikimedia-listBullet"></span> <span class="vector-dropdown-label-text">Toggle the table of contents</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-titlebar-toc-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <h1 id="firstHeading" class="firstHeading mw-first-heading"><span class="mw-page-title-main">Vanishing gradient problem</span></h1> <div id="p-lang-btn" class="vector-dropdown mw-portlet mw-portlet-lang" > <input type="checkbox" id="p-lang-btn-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-p-lang-btn" class="vector-dropdown-checkbox mw-interlanguage-selector" aria-label="Go to an article in another language. 
machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Learning with humans</div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Active_learning_(machine_learning)" title="Active learning (machine learning)">Active learning</a></li> <li><a href="/wiki/Crowdsourcing" title="Crowdsourcing">Crowdsourcing</a></li> <li><a href="/wiki/Human-in-the-loop" title="Human-in-the-loop">Human-in-the-loop</a></li> <li><a href="/wiki/Reinforcement_learning_from_human_feedback" title="Reinforcement learning from human feedback">RLHF</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Model diagnostics</div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Coefficient_of_determination" title="Coefficient of determination">Coefficient of determination</a></li> <li><a href="/wiki/Confusion_matrix" title="Confusion matrix">Confusion matrix</a></li> <li><a href="/wiki/Learning_curve_(machine_learning)" title="Learning curve (machine learning)">Learning curve</a></li> <li><a href="/wiki/Receiver_operating_characteristic" title="Receiver operating characteristic">ROC curve</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Mathematical foundations</div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Kernel_machines" class="mw-redirect" title="Kernel machines">Kernel machines</a></li> <li><a href="/wiki/Bias%E2%80%93variance_tradeoff" title="Bias–variance tradeoff">Bias–variance tradeoff</a></li> <li><a href="/wiki/Computational_learning_theory" title="Computational learning theory">Computational learning theory</a></li> <li><a href="/wiki/Empirical_risk_minimization" title="Empirical risk minimization">Empirical risk minimization</a></li> <li><a href="/wiki/Occam_learning" title="Occam learning">Occam learning</a></li> <li><a href="/wiki/Probably_approximately_correct_learning" title="Probably approximately correct learning">PAC learning</a></li> <li><a href="/wiki/Statistical_learning_theory" title="Statistical learning theory">Statistical learning</a></li> <li><a href="/wiki/Vapnik%E2%80%93Chervonenkis_theory" title="Vapnik–Chervonenkis theory">VC theory</a></li> <li><a href="/wiki/Topological_deep_learning" title="Topological deep learning">Topological deep learning</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Journals and conferences</div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/ECML_PKDD" title="ECML PKDD">ECML PKDD</a></li> <li><a href="/wiki/Conference_on_Neural_Information_Processing_Systems" title="Conference on Neural Information Processing Systems">NeurIPS</a></li> <li><a href="/wiki/International_Conference_on_Machine_Learning" title="International Conference on Machine Learning">ICML</a></li> <li><a 
href="/wiki/International_Conference_on_Learning_Representations" title="International Conference on Learning Representations">ICLR</a></li> <li><a href="/wiki/International_Joint_Conference_on_Artificial_Intelligence" title="International Joint Conference on Artificial Intelligence">IJCAI</a></li> <li><a href="/wiki/Machine_Learning_(journal)" title="Machine Learning (journal)">ML</a></li> <li><a href="/wiki/Journal_of_Machine_Learning_Research" title="Journal of Machine Learning Research">JMLR</a></li></ul></div></div></td> </tr><tr><td class="sidebar-content"> <div class="sidebar-list mw-collapsible mw-collapsed machine-learning-list-title"><div class="sidebar-list-title" style="border-top:1px solid #aaa; text-align:center;;color: var(--color-base)">Related articles</div><div class="sidebar-list-content mw-collapsible-content hlist"> <ul><li><a href="/wiki/Glossary_of_artificial_intelligence" title="Glossary of artificial intelligence">Glossary of artificial intelligence</a></li> <li><a href="/wiki/List_of_datasets_for_machine-learning_research" title="List of datasets for machine-learning research">List of datasets for machine-learning research</a> <ul><li><a href="/wiki/List_of_datasets_in_computer_vision_and_image_processing" title="List of datasets in computer vision and image processing">List of datasets in computer vision and image processing</a></li></ul></li> <li><a href="/wiki/Outline_of_machine_learning" title="Outline of machine learning">Outline of machine learning</a></li></ul></div></div></td> </tr><tr><td class="sidebar-navbar"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374" /><style data-mw-deduplicate="TemplateStyles:r1239400231">.mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{float:left;text-align:left}.mw-parser-output .navbar-boxtext{word-spacing:0}.mw-parser-output .navbar ul{display:inline-block;white-space:nowrap;line-height:inherit}.mw-parser-output .navbar-brackets::before{margin-right:-0.125em;content:"[ "}.mw-parser-output .navbar-brackets::after{margin-left:-0.125em;content:" ]"}.mw-parser-output .navbar li{word-spacing:-0.125em}.mw-parser-output .navbar a>span,.mw-parser-output .navbar a>abbr{text-decoration:inherit}.mw-parser-output .navbar-mini abbr{font-variant:small-caps;border-bottom:none;text-decoration:none;cursor:inherit}.mw-parser-output .navbar-ct-full{font-size:114%;margin:0 7em}.mw-parser-output .navbar-ct-mini{font-size:114%;margin:0 4em}html.skin-theme-clientpref-night .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}@media(prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}}@media print{.mw-parser-output .navbar{display:none!important}}</style><div class="navbar plainlinks hlist navbar-mini"><ul><li class="nv-view"><a href="/wiki/Template:Machine_learning" title="Template:Machine learning"><abbr title="View this template">v</abbr></a></li><li class="nv-talk"><a href="/wiki/Template_talk:Machine_learning" title="Template talk:Machine learning"><abbr title="Discuss this template">t</abbr></a></li><li class="nv-edit"><a href="/wiki/Special:EditPage/Template:Machine_learning" title="Special:EditPage/Template:Machine learning"><abbr title="Edit this template">e</abbr></a></li></ul></div></td></tr></tbody></table> <p>In <a href="/wiki/Machine_learning" title="Machine learning">machine learning</a>, the <b>vanishing gradient problem</b> is the problem of 
greatly diverging <a href="/wiki/Gradient" title="Gradient">gradient</a> magnitudes between earlier and later layers encountered when training <a href="/wiki/Neural_network_(machine_learning)" title="Neural network (machine learning)">neural networks</a> with <a href="/wiki/Backpropagation" title="Backpropagation">backpropagation</a>. In such methods, neural network weights are updated proportional to their <a href="/wiki/Partial_derivative" title="Partial derivative">partial derivative</a> of the <a href="/wiki/Loss_function" title="Loss function">loss function</a>.<sup id="cite_ref-Basodi2020_1-0" class="reference"><a href="#cite_note-Basodi2020-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup> As the number of forward propagation steps in a network increases, for instance due to greater network depth, the gradients of earlier weights are calculated with increasingly many multiplications. These multiplications shrink the gradient magnitude. Consequently, the gradients of earlier weights will be exponentially smaller than the gradients of later weights. This difference in gradient magnitude might introduce instability in the training process, slow it, or halt it entirely.<sup id="cite_ref-Basodi2020_1-1" class="reference"><a href="#cite_note-Basodi2020-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup> For instance, consider the <a href="/wiki/Hyperbolic_tangent" class="mw-redirect" title="Hyperbolic tangent">hyperbolic tangent</a> <a href="/wiki/Activation_function" title="Activation function">activation function</a>. The gradients of this function are in range <span class="texhtml">[-1,1]</span>. The product of repeated multiplication with such gradients decreases exponentially. The inverse problem, when weight gradients at earlier layers get exponentially larger, is called the <b>exploding gradient problem</b>. </p><p>Backpropagation allowed researchers to train <a href="/wiki/Supervised_learning" title="Supervised learning">supervised</a> deep artificial neural networks from scratch, initially with little success. 
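The decay can be seen numerically. The following sketch is an illustration added here, not taken from the article's references: it builds a toy deep tanh network with small random weights (the depth, width, and weight scale are arbitrary choices) and tracks the spectral norm of the Jacobian of the output with respect to the input as depth grows.

```python
import numpy as np

rng = np.random.default_rng(0)
depth, width = 50, 20

# Toy deep network: x_{l+1} = tanh(W_l x_l) with small random weights.
x = rng.standard_normal(width)
jac = np.eye(width)                      # running Jacobian d x_l / d x_0
norms = []
for _ in range(depth):
    W = 0.1 * rng.standard_normal((width, width))
    pre = W @ x
    x = np.tanh(pre)
    # Chain rule: each layer multiplies the Jacobian by diag(tanh'(pre)) @ W,
    # where tanh'(pre) = 1 - tanh(pre)**2 lies in (0, 1].
    jac = np.diag(1.0 - np.tanh(pre) ** 2) @ W @ jac
    norms.append(np.linalg.norm(jac, 2))

print(norms[0], norms[9], norms[-1])     # spectral norm shrinks toward zero
```

With the weight norms below one and the tanh derivative at most one, every factor contracts, so the Jacobian norm, and with it the gradient of the earliest layers, decays roughly geometrically in depth.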
<a href="/wiki/Sepp_Hochreiter" title="Sepp Hochreiter">Hochreiter</a>'s <a href="/wiki/Diplom" title="Diplom">diplom</a> thesis of 1991 formally identified the reason for this failure in the "vanishing gradient problem",<sup id="cite_ref-2" class="reference"><a href="#cite_note-2"><span class="cite-bracket">[</span>2<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-3" class="reference"><a href="#cite_note-3"><span class="cite-bracket">[</span>3<span class="cite-bracket">]</span></a></sup> which not only affects <a href="/wiki/Deep_learning" title="Deep learning">many-layered</a> <a href="/wiki/Feedforward_neural_network" title="Feedforward neural network">feedforward networks</a>,<sup id="cite_ref-4" class="reference"><a href="#cite_note-4"><span class="cite-bracket">[</span>4<span class="cite-bracket">]</span></a></sup> but also <a href="/wiki/Recurrent_neural_network" title="Recurrent neural network">recurrent networks</a>.<sup id="cite_ref-5" class="reference"><a href="#cite_note-5"><span class="cite-bracket">[</span>5<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-:1_6-0" class="reference"><a href="#cite_note-:1-6"><span class="cite-bracket">[</span>6<span class="cite-bracket">]</span></a></sup> The latter are trained by unfolding them into very deep feedforward networks, where a new layer is created for each time-step of an input sequence processed by the network (the combination of unfolding and backpropagation is termed <a href="/wiki/Backpropagation_through_time" title="Backpropagation through time">backpropagation through time</a>). </p> <style data-mw-deduplicate="TemplateStyles:r886046785">.mw-parser-output .toclimit-2 .toclevel-1 ul,.mw-parser-output .toclimit-3 .toclevel-2 ul,.mw-parser-output .toclimit-4 .toclevel-3 ul,.mw-parser-output .toclimit-5 .toclevel-4 ul,.mw-parser-output .toclimit-6 .toclevel-5 ul,.mw-parser-output .toclimit-7 .toclevel-6 ul{display:none}</style><div class="toclimit-3"><meta property="mw:PageProp/toc" /></div> <div class="mw-heading mw-heading2"><h2 id="Prototypical_models">Prototypical models</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vanishing_gradient_problem&action=edit&section=1" title="Edit section: Prototypical models"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>This section is based on the paper <i>On the difficulty of training Recurrent Neural Networks</i> by Pascanu, Mikolov, and Bengio.<sup id="cite_ref-:1_6-1" class="reference"><a href="#cite_note-:1-6"><span class="cite-bracket">[</span>6<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Recurrent_network_model">Recurrent network model</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vanishing_gradient_problem&action=edit&section=2" title="Edit section: Recurrent network model"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div><p> A generic recurrent network has hidden states <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle h_{1},h_{2},...}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>h</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo>,</mo> <msub> <mi>h</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> 
<mo>,</mo> <mo>.</mo> <mo>.</mo> <mo>.</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle h_{1},h_{2},...}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/722200529af3de4db648dfcc8b1883aca575e24a" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:9.569ex; height:2.509ex;" alt="{\displaystyle h_{1},h_{2},...}" /></span> inputs <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle u_{1},u_{2},...}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>u</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo>,</mo> <msub> <mi>u</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo>,</mo> <mo>.</mo> <mo>.</mo> <mo>.</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle u_{1},u_{2},...}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/e9a20386245fc36e68357ca86063e3a6c08ba611" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:9.55ex; height:2.009ex;" alt="{\displaystyle u_{1},u_{2},...}" /></span>, and outputs <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle x_{1},x_{2},...}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo>,</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo>,</mo> <mo>.</mo> <mo>.</mo> <mo>.</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle x_{1},x_{2},...}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/6b2f60975da3f37f2277970a7ac48ada8c5aee11" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:9.55ex; height:2.009ex;" alt="{\displaystyle x_{1},x_{2},...}" /></span>. 
Let it be parametrized by <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \theta }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>θ<!-- θ --></mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \theta }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/6e5ab2664b422d53eb0c7df3b87e1360d75ad9af" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.09ex; height:2.176ex;" alt="{\displaystyle \theta }" /></span>, so that the system evolves as<span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle (h_{t},x_{t})=F(h_{t-1},u_{t},\theta )}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mo stretchy="false">(</mo> <msub> <mi>h</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>,</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo stretchy="false">)</mo> <mo>=</mo> <mi>F</mi> <mo stretchy="false">(</mo> <msub> <mi>h</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msub> <mo>,</mo> <msub> <mi>u</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>,</mo> <mi>θ<!-- θ --></mi> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle (h_{t},x_{t})=F(h_{t-1},u_{t},\theta )}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/6b73e4f9719df0eefd64d5d5979b453f2d874544" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:23.392ex; height:2.843ex;" alt="{\displaystyle (h_{t},x_{t})=F(h_{t-1},u_{t},\theta )}" /></span>Often, the output <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle x_{t}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle x_{t}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f279a30bc8eabc788f3fe81c9cfb674e72e858db" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:2.156ex; height:2.009ex;" alt="{\displaystyle x_{t}}" /></span> is a function of <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle h_{t}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>h</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle h_{t}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/e8dbf3d8bfe322f68ff6400385578f8d78e1ba7c" class="mwe-math-fallback-image-inline mw-invert 
skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:2.165ex; height:2.509ex;" alt="{\displaystyle h_{t}}" /></span>, as some <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle x_{t}=G(h_{t})}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>=</mo> <mi>G</mi> <mo stretchy="false">(</mo> <msub> <mi>h</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle x_{t}=G(h_{t})}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/03ed64ee07f55ea4f9b2462004ed69dc8130fece" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:11.055ex; height:2.843ex;" alt="{\displaystyle x_{t}=G(h_{t})}" /></span>. The vanishing gradient problem already presents itself clearly when <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle x_{t}=h_{t}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>=</mo> <msub> <mi>h</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle x_{t}=h_{t}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/1d94b7a685858e17254e8cb1d835b37448f4c1e4" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:7.419ex; height:2.509ex;" alt="{\displaystyle x_{t}=h_{t}}" /></span>, so we simplify our notation to the special case with:<span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle x_{t}=F(x_{t-1},u_{t},\theta )}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>=</mo> <mi>F</mi> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msub> <mo>,</mo> <msub> <mi>u</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>,</mo> <mi>θ<!-- θ --></mi> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle x_{t}=F(x_{t-1},u_{t},\theta )}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/73a3b6b7a3df0ad4122c6a4d84c6fa65b6bec5c7" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:18.374ex; height:2.843ex;" alt="{\displaystyle x_{t}=F(x_{t-1},u_{t},\theta )}" /></span> Now, take its <a href="/wiki/Differential_form" title="Differential form">differential</a>:<span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" 
alttext="{\displaystyle {\begin{aligned}dx_{t}&=\nabla _{\theta }F(x_{t-1},u_{t},\theta )d\theta +\nabla _{x}F(x_{t-1},u_{t},\theta )dx_{t-1}\\&=\nabla _{\theta }F(x_{t-1},u_{t},\theta )d\theta +\nabla _{x}F(x_{t-1},u_{t},\theta )(\nabla _{\theta }F(x_{t-2},u_{t-1},\theta )d\theta +\nabla _{x}F(x_{t-2},u_{t-1},\theta )dx_{t-2})\\&=\cdots \\&=\left(\nabla _{\theta }F(x_{t-1},u_{t},\theta )+\nabla _{x}F(x_{t-1},u_{t},\theta )\nabla _{\theta }F(x_{t-2},u_{t-1},\theta )+\cdots \right)d\theta \end{aligned}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"> <mtr> <mtd> <mi>d</mi> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> </mtd> <mtd> <mi></mi> <mo>=</mo> <msub> <mi mathvariant="normal">∇<!-- ∇ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mi>θ<!-- θ --></mi> </mrow> </msub> <mi>F</mi> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msub> <mo>,</mo> <msub> <mi>u</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>,</mo> <mi>θ<!-- θ --></mi> <mo stretchy="false">)</mo> <mi>d</mi> <mi>θ<!-- θ --></mi> <mo>+</mo> <msub> <mi mathvariant="normal">∇<!-- ∇ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mi>x</mi> </mrow> </msub> <mi>F</mi> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msub> <mo>,</mo> <msub> <mi>u</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>,</mo> <mi>θ<!-- θ --></mi> <mo stretchy="false">)</mo> <mi>d</mi> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msub> </mtd> </mtr> <mtr> <mtd></mtd> <mtd> <mi></mi> <mo>=</mo> <msub> <mi mathvariant="normal">∇<!-- ∇ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mi>θ<!-- θ --></mi> </mrow> </msub> <mi>F</mi> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msub> <mo>,</mo> <msub> <mi>u</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>,</mo> <mi>θ<!-- θ --></mi> <mo stretchy="false">)</mo> <mi>d</mi> <mi>θ<!-- θ --></mi> <mo>+</mo> <msub> <mi mathvariant="normal">∇<!-- ∇ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mi>x</mi> </mrow> </msub> <mi>F</mi> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msub> <mo>,</mo> <msub> <mi>u</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>,</mo> <mi>θ<!-- θ --></mi> <mo stretchy="false">)</mo> <mo stretchy="false">(</mo> <msub> <mi mathvariant="normal">∇<!-- ∇ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mi>θ<!-- θ --></mi> </mrow> </msub> <mi>F</mi> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>2</mn> </mrow> </msub> <mo>,</mo> <msub> <mi>u</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msub> <mo>,</mo> <mi>θ<!-- θ --></mi> <mo stretchy="false">)</mo> <mi>d</mi> <mi>θ<!-- θ --></mi> <mo>+</mo> <msub> <mi mathvariant="normal">∇<!-- ∇ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mi>x</mi> </mrow> </msub> <mi>F</mi> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> 
<mi>t</mi> <mo>−<!-- − --></mo> <mn>2</mn> </mrow> </msub> <mo>,</mo> <msub> <mi>u</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msub> <mo>,</mo> <mi>θ<!-- θ --></mi> <mo stretchy="false">)</mo> <mi>d</mi> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>2</mn> </mrow> </msub> <mo stretchy="false">)</mo> </mtd> </mtr> <mtr> <mtd></mtd> <mtd> <mi></mi> <mo>=</mo> <mo>⋯<!-- ⋯ --></mo> </mtd> </mtr> <mtr> <mtd></mtd> <mtd> <mi></mi> <mo>=</mo> <mrow> <mo>(</mo> <mrow> <msub> <mi mathvariant="normal">∇<!-- ∇ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mi>θ<!-- θ --></mi> </mrow> </msub> <mi>F</mi> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msub> <mo>,</mo> <msub> <mi>u</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>,</mo> <mi>θ<!-- θ --></mi> <mo stretchy="false">)</mo> <mo>+</mo> <msub> <mi mathvariant="normal">∇<!-- ∇ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mi>x</mi> </mrow> </msub> <mi>F</mi> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msub> <mo>,</mo> <msub> <mi>u</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>,</mo> <mi>θ<!-- θ --></mi> <mo stretchy="false">)</mo> <msub> <mi mathvariant="normal">∇<!-- ∇ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mi>θ<!-- θ --></mi> </mrow> </msub> <mi>F</mi> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>2</mn> </mrow> </msub> <mo>,</mo> <msub> <mi>u</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msub> <mo>,</mo> <mi>θ<!-- θ --></mi> <mo stretchy="false">)</mo> <mo>+</mo> <mo>⋯<!-- ⋯ --></mo> </mrow> <mo>)</mo> </mrow> <mi>d</mi> <mi>θ<!-- θ --></mi> </mtd> </mtr> </mtable> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\begin{aligned}dx_{t}&=\nabla _{\theta }F(x_{t-1},u_{t},\theta )d\theta +\nabla _{x}F(x_{t-1},u_{t},\theta )dx_{t-1}\\&=\nabla _{\theta }F(x_{t-1},u_{t},\theta )d\theta +\nabla _{x}F(x_{t-1},u_{t},\theta )(\nabla _{\theta }F(x_{t-2},u_{t-1},\theta )d\theta +\nabla _{x}F(x_{t-2},u_{t-1},\theta )dx_{t-2})\\&=\cdots \\&=\left(\nabla _{\theta }F(x_{t-1},u_{t},\theta )+\nabla _{x}F(x_{t-1},u_{t},\theta )\nabla _{\theta }F(x_{t-2},u_{t-1},\theta )+\cdots \right)d\theta \end{aligned}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/98fabc94da7acd7f8bf24e76585952f1a1fc2b62" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -5.505ex; width:93.573ex; height:12.176ex;" alt="{\displaystyle {\begin{aligned}dx_{t}&=\nabla _{\theta }F(x_{t-1},u_{t},\theta )d\theta +\nabla _{x}F(x_{t-1},u_{t},\theta )dx_{t-1}\\&=\nabla _{\theta }F(x_{t-1},u_{t},\theta )d\theta +\nabla _{x}F(x_{t-1},u_{t},\theta )(\nabla _{\theta }F(x_{t-2},u_{t-1},\theta )d\theta +\nabla _{x}F(x_{t-2},u_{t-1},\theta )dx_{t-2})\\&=\cdots \\&=\left(\nabla _{\theta }F(x_{t-1},u_{t},\theta )+\nabla _{x}F(x_{t-1},u_{t},\theta )\nabla _{\theta }F(x_{t-2},u_{t-1},\theta )+\cdots \right)d\theta \end{aligned}}}" /></span>Training the network requires us to define a loss function to be minimized. 
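The recursion $dx_t = \nabla_\theta F\, d\theta + \nabla_x F\, dx_{t-1}$ can be checked numerically. The sketch below is an added illustration: the scalar form $F(x, u, \theta) = \tanh(\theta x + u)$ and all constants are assumptions made here for concreteness, while the article's $F$ is generic. It accumulates $dx_t/d\theta$ with the recursion and compares the result against a finite-difference estimate.

```python
import numpy as np

# Assumed scalar special case F(x, u, theta) = tanh(theta * x + u), chosen
# only to make the recursion easy to check.
def run(theta, u, x0=0.1):
    xs = [x0]
    for ut in u:
        xs.append(np.tanh(theta * xs[-1] + ut))
    return xs

rng = np.random.default_rng(1)
theta, u = 0.8, rng.standard_normal(30)
xs = run(theta, u)

# Accumulate dx_t/dtheta with the recursion
#   dx_t = grad_theta F * dtheta + grad_x F * dx_{t-1}.
dxdtheta = 0.0                            # dx_0/dtheta = 0
for t, ut in enumerate(u):
    dtanh = 1.0 - np.tanh(theta * xs[t] + ut) ** 2
    dxdtheta = dtanh * xs[t] + dtanh * theta * dxdtheta

# Finite-difference estimate of the same derivative.
eps = 1e-6
fd = (run(theta + eps, u)[-1] - run(theta - eps, u)[-1]) / (2 * eps)
print(dxdtheta, fd)                       # the two values should agree closely
```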
Training the network requires us to define a loss function to be minimized. Let it be $L(x_T, u_1, \ldots, u_T)$.[note 1] Minimizing it by gradient descent gives

$$dL = \nabla_x L(x_T, u_1, \ldots, u_T)\bigl(\nabla_\theta F(x_{t-1}, u_t, \theta) + \nabla_x F(x_{t-1}, u_t, \theta)\, \nabla_\theta F(x_{t-2}, u_{t-1}, \theta) + \cdots\bigr)\, d\theta \quad \text{(loss differential)}$$

$$\Delta\theta = -\eta \cdot \bigl[\nabla_x L(x_T)\bigl(\nabla_\theta F(x_{t-1}, u_t, \theta) + \nabla_x F(x_{t-1}, u_t, \theta)\, \nabla_\theta F(x_{t-2}, u_{t-1}, \theta) + \cdots\bigr)\bigr]^T$$

where $\eta$ is the learning rate.

The vanishing/exploding gradient problem appears because there are repeated multiplications of the form

$$\nabla_x F(x_{t-1}, u_t, \theta)\, \nabla_x F(x_{t-2}, u_{t-1}, \theta)\, \nabla_x F(x_{t-3}, u_{t-2}, \theta) \cdots$$
style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle x_{t}=F(x_{t-1},u_{t},\theta )=W_{rec}\sigma (x_{t-1})+W_{in}u_{t}+b}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>=</mo> <mi>F</mi> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msub> <mo>,</mo> <msub> <mi>u</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>,</mo> <mi>θ<!-- θ --></mi> <mo stretchy="false">)</mo> <mo>=</mo> <msub> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>r</mi> <mi>e</mi> <mi>c</mi> </mrow> </msub> <mi>σ<!-- σ --></mi> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo>+</mo> <msub> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> <mi>n</mi> </mrow> </msub> <msub> <mi>u</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>+</mo> <mi>b</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle x_{t}=F(x_{t-1},u_{t},\theta )=W_{rec}\sigma (x_{t-1})+W_{in}u_{t}+b}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/ba00dff3e80f75f3602220b72be1a1a40294096d" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:46.327ex; height:2.843ex;" alt="{\displaystyle x_{t}=F(x_{t-1},u_{t},\theta )=W_{rec}\sigma (x_{t-1})+W_{in}u_{t}+b}" /></span>where <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \theta =(W_{rec},W_{in})}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>θ<!-- θ --></mi> <mo>=</mo> <mo stretchy="false">(</mo> <msub> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>r</mi> <mi>e</mi> <mi>c</mi> </mrow> </msub> <mo>,</mo> <msub> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> <mi>n</mi> </mrow> </msub> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \theta =(W_{rec},W_{in})}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/a172ba6d89ffe5dcfa8d17bcd00b97789fd163b7" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:15.657ex; height:2.843ex;" alt="{\displaystyle \theta =(W_{rec},W_{in})}" /></span> is the network parameter, <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \sigma }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>σ<!-- σ --></mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \sigma }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/59f59b7c3e6fdb1d0365a494b81fb9a696138c36" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.33ex; height:1.676ex;" alt="{\displaystyle \sigma }" /></span> is the <a href="/wiki/Sigmoid_function" title="Sigmoid function">sigmoid activation 
function</a><sup id="cite_ref-sigmoid_activation_function_8-0" class="reference"><a href="#cite_note-sigmoid_activation_function-8"><span class="cite-bracket">[</span>note 2<span class="cite-bracket">]</span></a></sup>, applied to each vector coordinate separately, and <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle b}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>b</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle b}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f11423fbb2e967f986e36804a8ae4271734917c3" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:0.998ex; height:2.176ex;" alt="{\displaystyle b}" /></span> is the bias vector. </p><p>Then, <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \nabla _{x}F(x_{t-1},u_{t},\theta )=W_{rec}\mathop {diag} (\sigma '(x_{t-1}))}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi mathvariant="normal">∇<!-- ∇ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mi>x</mi> </mrow> </msub> <mi>F</mi> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msub> <mo>,</mo> <msub> <mi>u</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>,</mo> <mi>θ<!-- θ --></mi> <mo stretchy="false">)</mo> <mo>=</mo> <msub> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>r</mi> <mi>e</mi> <mi>c</mi> </mrow> </msub> <mrow class="MJX-TeXAtom-OP"> <mi>d</mi> <mi>i</mi> <mi>a</mi> <mi>g</mi> </mrow> <mo>⁡<!-- --></mo> <mo stretchy="false">(</mo> <msup> <mi>σ<!-- σ --></mi> <mo>′</mo> </msup> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \nabla _{x}F(x_{t-1},u_{t},\theta )=W_{rec}\mathop {diag} (\sigma '(x_{t-1}))}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/c5a4c7302b8b6bec2fc1befb133f0cb7b934d04d" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:38.614ex; height:3.009ex;" alt="{\displaystyle \nabla _{x}F(x_{t-1},u_{t},\theta )=W_{rec}\mathop {diag} (\sigma '(x_{t-1}))}" /></span>, and so <span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\begin{aligned}\nabla _{x}F(x_{t-1},u_{t},\theta )&\nabla _{x}F(x_{t-2},u_{t-1},\theta )\cdots \nabla _{x}F(x_{t-k},u_{t-k+1},\theta )\\=W_{rec}\mathop {diag} (\sigma '(x_{t-1}))&W_{rec}\mathop {diag} (\sigma '(x_{t-2}))\cdots W_{rec}\mathop {diag} (\sigma '(x_{t-k}))\end{aligned}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 
0em 2em 0em" displaystyle="true"> <mtr> <mtd> <msub> <mi mathvariant="normal">∇<!-- ∇ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mi>x</mi> </mrow> </msub> <mi>F</mi> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msub> <mo>,</mo> <msub> <mi>u</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>,</mo> <mi>θ<!-- θ --></mi> <mo stretchy="false">)</mo> </mtd> <mtd> <msub> <mi mathvariant="normal">∇<!-- ∇ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mi>x</mi> </mrow> </msub> <mi>F</mi> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>2</mn> </mrow> </msub> <mo>,</mo> <msub> <mi>u</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msub> <mo>,</mo> <mi>θ<!-- θ --></mi> <mo stretchy="false">)</mo> <mo>⋯<!-- ⋯ --></mo> <msub> <mi mathvariant="normal">∇<!-- ∇ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mi>x</mi> </mrow> </msub> <mi>F</mi> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mi>k</mi> </mrow> </msub> <mo>,</mo> <msub> <mi>u</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mi>k</mi> <mo>+</mo> <mn>1</mn> </mrow> </msub> <mo>,</mo> <mi>θ<!-- θ --></mi> <mo stretchy="false">)</mo> </mtd> </mtr> <mtr> <mtd> <mo>=</mo> <msub> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>r</mi> <mi>e</mi> <mi>c</mi> </mrow> </msub> <mrow class="MJX-TeXAtom-OP"> <mi>d</mi> <mi>i</mi> <mi>a</mi> <mi>g</mi> </mrow> <mo>⁡<!-- --></mo> <mo stretchy="false">(</mo> <msup> <mi>σ<!-- σ --></mi> <mo>′</mo> </msup> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo stretchy="false">)</mo> </mtd> <mtd> <msub> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>r</mi> <mi>e</mi> <mi>c</mi> </mrow> </msub> <mrow class="MJX-TeXAtom-OP"> <mi>d</mi> <mi>i</mi> <mi>a</mi> <mi>g</mi> </mrow> <mo>⁡<!-- --></mo> <mo stretchy="false">(</mo> <msup> <mi>σ<!-- σ --></mi> <mo>′</mo> </msup> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mn>2</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo stretchy="false">)</mo> <mo>⋯<!-- ⋯ --></mo> <msub> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>r</mi> <mi>e</mi> <mi>c</mi> </mrow> </msub> <mrow class="MJX-TeXAtom-OP"> <mi>d</mi> <mi>i</mi> <mi>a</mi> <mi>g</mi> </mrow> <mo>⁡<!-- --></mo> <mo stretchy="false">(</mo> <msup> <mi>σ<!-- σ --></mi> <mo>′</mo> </msup> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>−<!-- − --></mo> <mi>k</mi> </mrow> </msub> <mo stretchy="false">)</mo> <mo stretchy="false">)</mo> </mtd> </mtr> </mtable> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\begin{aligned}\nabla _{x}F(x_{t-1},u_{t},\theta )&\nabla _{x}F(x_{t-2},u_{t-1},\theta )\cdots \nabla _{x}F(x_{t-k},u_{t-k+1},\theta )\\=W_{rec}\mathop {diag} (\sigma '(x_{t-1}))&W_{rec}\mathop {diag} (\sigma '(x_{t-2}))\cdots W_{rec}\mathop {diag} (\sigma '(x_{t-k}))\end{aligned}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/84b865c4df158754db7fcac342ad41c575b1ed08" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.505ex; width:64.817ex; height:6.176ex;" alt="{\displaystyle 
{\begin{aligned}\nabla _{x}F(x_{t-1},u_{t},\theta )&\nabla _{x}F(x_{t-2},u_{t-1},\theta )\cdots \nabla _{x}F(x_{t-k},u_{t-k+1},\theta )\\=W_{rec}\mathop {diag} (\sigma '(x_{t-1}))&W_{rec}\mathop {diag} (\sigma '(x_{t-2}))\cdots W_{rec}\mathop {diag} (\sigma '(x_{t-k}))\end{aligned}}}" /></span>Since <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle |\sigma '|\leq 1}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <msup> <mi>σ<!-- σ --></mi> <mo>′</mo> </msup> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mo>≤<!-- ≤ --></mo> <mn>1</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle |\sigma '|\leq 1}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/2d9db24a0e4a45354458d9934a03eb948b9ae0a6" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:7.57ex; height:3.009ex;" alt="{\displaystyle |\sigma '|\leq 1}" /></span>, the <a href="/wiki/Operator_norm" title="Operator norm">operator norm</a> of the above multiplication is bounded above by <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \|W_{rec}\|^{k}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mo fence="false" stretchy="false">‖<!-- ‖ --></mo> <msub> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>r</mi> <mi>e</mi> <mi>c</mi> </mrow> </msub> <msup> <mo fence="false" stretchy="false">‖<!-- ‖ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>k</mi> </mrow> </msup> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \|W_{rec}\|^{k}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/d36dd8bb5e217b55c63b5760e5aa824877cb4eac" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:8.059ex; height:3.176ex;" alt="{\displaystyle \|W_{rec}\|^{k}}" /></span>. 
So if the <a href="/wiki/Spectral_radius" title="Spectral radius">spectral radius</a> of <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle W_{rec}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>W</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>r</mi> <mi>e</mi> <mi>c</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle W_{rec}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/96e291e7ab7480d5a8991ecf8b0d658fec2bf8a3" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:4.646ex; height:2.509ex;" alt="{\displaystyle W_{rec}}" /></span> is <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \gamma <1}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>γ<!-- γ --></mi> <mo><</mo> <mn>1</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \gamma <1}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/0d2fc7ea47b14b4cbe727be1a6d404b92d1f0900" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:5.523ex; height:2.676ex;" alt="{\displaystyle \gamma <1}" /></span>, then at large <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle k}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>k</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle k}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/c3c9a2c7b599b37105512c5d570edc034056dd40" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.211ex; height:2.176ex;" alt="{\displaystyle k}" /></span>, the above multiplication has operator norm bounded above by <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \gamma ^{k}\to 0}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msup> <mi>γ<!-- γ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mi>k</mi> </mrow> </msup> <mo stretchy="false">→<!-- → --></mo> <mn>0</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \gamma ^{k}\to 0}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/53ae17c7d3bd67751f9dd8f287f1493298cdfa06" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:7.145ex; height:3.176ex;" alt="{\displaystyle \gamma ^{k}\to 0}" /></span>. This is the prototypical vanishing gradient problem. </p><p>The effect of a vanishing gradient is that the network cannot learn long-range effects. 
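A short numerical sketch makes the decay concrete. The following snippet (the hidden size, seed, and rescaling of $W_{rec}$ are illustrative assumptions, not part of the analysis above) builds a $W_{rec}$ with spectral radius $\gamma = 0.9$ and accumulates the Jacobian product; its operator norm collapses geometrically in $k$, in fact even faster than $\gamma^k$, since $|\sigma'| \leq 1/4$ for the logistic sigmoid.

```python
import numpy as np

rng = np.random.default_rng(0)
n = 20        # hidden-state dimension (illustrative)
gamma = 0.9   # target spectral radius < 1

# Random recurrent matrix, rescaled so its spectral radius is exactly gamma.
W_rec = rng.normal(size=(n, n))
W_rec *= gamma / np.abs(np.linalg.eigvals(W_rec)).max()

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

x = rng.normal(size=n)  # current state along a trajectory
J = np.eye(n)           # running product of Jacobian factors
for k in range(1, 51):
    # Multiply in one factor W_rec diag(sigma'(x)), then advance the state
    # under the autonomous dynamics (u_t = 0, b = 0).
    J = J @ (W_rec @ np.diag(sigmoid(x) * (1.0 - sigmoid(x))))
    x = W_rec @ sigmoid(x)
    if k % 10 == 0:
        # Operator (spectral) norm of the product: <= gamma**k, here far smaller.
        print(k, np.linalg.norm(J, 2))
```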
The effect of a vanishing gradient is that the network cannot learn long-range effects. Recall Equation (loss differential):

$$\nabla_\theta L = \nabla_x L(x_T, u_1, \ldots, u_T) \left( \nabla_\theta F(x_{t-1}, u_t, \theta) + \nabla_x F(x_{t-1}, u_t, \theta)\, \nabla_\theta F(x_{t-2}, u_{t-1}, \theta) + \cdots \right)$$

The components of $\nabla_\theta F(x, u, \theta)$ are just components of $\sigma(x)$ and $u$, so if $u_t, u_{t-1}, \ldots$ are bounded, then $\|\nabla_\theta F(x_{t-k-1}, u_{t-k}, \theta)\|$ is also bounded by some $M > 0$, and so the terms in $\nabla_\theta L$ decay as $M\gamma^k$. This means that, effectively, $\nabla_\theta L$ is affected only by the first $O(\gamma^{-1})$ terms in the sum.
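As a quick numeric illustration of how fast these terms die off (the values of $\gamma$ and $M$ below are arbitrary illustrative choices, not derived from any particular network):

```python
import numpy as np

gamma, M = 0.9, 1.0                  # illustrative values
terms = M * gamma ** np.arange(200)  # bound M * gamma**k on the k-th term
total = terms.sum()                  # ~ M / (1 - gamma)

# Fraction of the full sum contributed by the first K terms:
for K in (10, 20, 50):
    print(K, round(terms[:K].sum() / total, 3))
# -> 10 0.651, 20 0.878, 50 0.995: with gamma = 0.9 the first ~20 terms
# already carry ~88% of the bound, so the gradient effectively ignores
# dependencies further back in time.
```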
</p> <div class="mw-heading mw-heading3"><h3 id="Dynamical_systems_model">Dynamical systems model</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vanishing_gradient_problem&action=edit&section=4" title="Edit section: Dynamical systems model"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <figure class="mw-default-size" typeof="mw:File/Thumb"><a href="/wiki/File:One-neuron_recurrent_network_bifurcation_diagram.png" class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/f/f4/One-neuron_recurrent_network_bifurcation_diagram.png/220px-One-neuron_recurrent_network_bifurcation_diagram.png" decoding="async" width="220" height="220" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/f/f4/One-neuron_recurrent_network_bifurcation_diagram.png/330px-One-neuron_recurrent_network_bifurcation_diagram.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/f/f4/One-neuron_recurrent_network_bifurcation_diagram.png/440px-One-neuron_recurrent_network_bifurcation_diagram.png 2x" data-file-width="800" data-file-height="800" /></a><figcaption><a href="/wiki/Bifurcation_diagram" title="Bifurcation diagram">Bifurcation diagram</a> of the one-neuron recurrent network. Horizontal axis is b, and vertical axis is x. The black curve is the set of stable and unstable equilibria. Notice that the system exhibits <a href="/wiki/Hysteresis" title="Hysteresis">hysteresis</a>, and can be used as a one-bit memory. </figcaption></figure> <p>Following (Doya, 1993),<sup id="cite_ref-10" class="reference"><a href="#cite_note-10"><span class="cite-bracket">[</span>7<span class="cite-bracket">]</span></a></sup> consider this one-neuron recurrent network with sigmoid activation:<span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle x_{t+1}=(1-\epsilon )x_{t}+\epsilon \sigma (wx_{t}+b)+\epsilon w'u_{t}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>+</mo> <mn>1</mn> </mrow> </msub> <mo>=</mo> <mo stretchy="false">(</mo> <mn>1</mn> <mo>−<!-- − --></mo> <mi>ϵ<!-- ϵ --></mi> <mo stretchy="false">)</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>+</mo> <mi>ϵ<!-- ϵ --></mi> <mi>σ<!-- σ --></mi> <mo stretchy="false">(</mo> <mi>w</mi> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>+</mo> <mi>b</mi> <mo stretchy="false">)</mo> <mo>+</mo> <mi>ϵ<!-- ϵ --></mi> <msup> <mi>w</mi> <mo>′</mo> </msup> <msub> <mi>u</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle x_{t+1}=(1-\epsilon )x_{t}+\epsilon \sigma (wx_{t}+b)+\epsilon w'u_{t}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/c8e8579e0f2fc37c5976fafec84a814b4803f178" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:39.137ex; height:3.009ex;" alt="{\displaystyle x_{t+1}=(1-\epsilon )x_{t}+\epsilon \sigma (wx_{t}+b)+\epsilon w'u_{t}}" /></span>At the small <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" 
alttext="{\displaystyle \epsilon }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>ϵ<!-- ϵ --></mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \epsilon }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/c3837cad72483d97bcdde49c85d3b7b859fb3fd2" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:0.944ex; height:1.676ex;" alt="{\displaystyle \epsilon }" /></span> limit, the dynamics of the network becomes<span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\frac {dx}{dt}}=-x(t)+\sigma (wx(t)+b)+w'u(t)}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mrow> <mi>d</mi> <mi>x</mi> </mrow> <mrow> <mi>d</mi> <mi>t</mi> </mrow> </mfrac> </mrow> <mo>=</mo> <mo>−<!-- − --></mo> <mi>x</mi> <mo stretchy="false">(</mo> <mi>t</mi> <mo stretchy="false">)</mo> <mo>+</mo> <mi>σ<!-- σ --></mi> <mo stretchy="false">(</mo> <mi>w</mi> <mi>x</mi> <mo stretchy="false">(</mo> <mi>t</mi> <mo stretchy="false">)</mo> <mo>+</mo> <mi>b</mi> <mo stretchy="false">)</mo> <mo>+</mo> <msup> <mi>w</mi> <mo>′</mo> </msup> <mi>u</mi> <mo stretchy="false">(</mo> <mi>t</mi> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\frac {dx}{dt}}=-x(t)+\sigma (wx(t)+b)+w'u(t)}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/5be8c37ebd06782cb6cc66de748278e7242888b9" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.005ex; width:36.895ex; height:5.509ex;" alt="{\displaystyle {\frac {dx}{dt}}=-x(t)+\sigma (wx(t)+b)+w'u(t)}" /></span>Consider first the <a href="/wiki/Autonomous_system_(mathematics)" title="Autonomous system (mathematics)">autonomous</a> case, with <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle u=0}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>u</mi> <mo>=</mo> <mn>0</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle u=0}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/41c2a269988ef0c55e0449b74950a4976e35a067" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:5.591ex; height:2.176ex;" alt="{\displaystyle u=0}" /></span>. 
Set $w = 5.0$, and vary $b$ in $[-3, -2]$. As $b$ decreases, the system has one stable point, then two stable points and one unstable point, and finally one stable point again.
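These equilibria are easy to find numerically. The sketch below (the Euler step size, horizon, and probe value $b = -2.5$ inside the bistable window are illustrative choices) integrates the autonomous dynamics from two initial conditions, which settle to the two different stable points, the hysteresis/one-bit-memory behaviour shown in the bifurcation diagram:

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def settle(x0, b, w=5.0, dt=0.01, T=100.0):
    """Euler-integrate dx/dt = -x + sigmoid(w*x + b) from x(0) = x0."""
    x = x0
    for _ in range(int(T / dt)):
        x += dt * (-x + sigmoid(w * x + b))
    return x

b = -2.5  # inside the bistable window
print(settle(0.05, b))  # low stable equilibrium, ~0.145
print(settle(0.95, b))  # high stable equilibrium, ~0.855
```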
Explicitly, the stable points are $(x, b) = \left(x,\ \ln\!\left(\frac{x}{1-x}\right) - 5x\right)$.

Now consider $\frac{\Delta x(T)}{\Delta x(0)}$ and $\frac{\Delta x(T)}{\Delta b}$, where $T$ is large enough that the system has settled into one of the stable points.

If $(x(0), b)$ puts the system very close to an unstable point, then a tiny variation in $x(0)$ or $b$ would make $x(T)$ move from one stable point to the other. This makes $\frac{\Delta x(T)}{\Delta x(0)}$ and $\frac{\Delta x(T)}{\Delta b}$ both very large, a case of the exploding gradient.

If $(x(0), b)$ puts the system far from an unstable point, then a small variation in $x(0)$ would have no effect on $x(T)$, making $\frac{\Delta x(T)}{\Delta x(0)} = 0$, a case of the vanishing gradient.

Note that in this case, $\frac{\Delta x(T)}{\Delta b} \approx \frac{\partial x(T)}{\partial b} = \left(\frac{1}{x(T)(1 - x(T))} - 5\right)^{-1}$ neither decays to zero nor blows up to infinity. Indeed, it is the only well-behaved gradient, which explains why early research focused on learning or designing recurrent systems that could perform long-range computations (such as outputting the first input seen at the very end of an episode) by shaping their stable attractors.[8]

For the general case, the intuition still holds ([6], Figures 3, 4, and 5).
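Both regimes can be checked with finite differences; a minimal sketch, assuming the same kind of Euler integrator as above (the step size, horizon, and probe points are illustrative):

```python
import numpy as np

def settle(x0, b, w=5.0, dt=0.01, T=100.0):
    # Euler-integrate dx/dt = -x + sigmoid(w*x + b) from x(0) = x0.
    x = x0
    for _ in range(int(T / dt)):
        x += dt * (-x + 1.0 / (1.0 + np.exp(-(w * x + b))))
    return x

b, eps = -2.5, 1e-3

# Far from the unstable point x = 0.5: both runs fall into the same
# attractor, so the finite-difference gradient w.r.t. x(0) vanishes.
print((settle(0.9 + eps, b) - settle(0.9, b)) / eps)              # ~0

# Straddling the unstable point: the runs reach different attractors,
# so the same finite difference explodes, ~(0.855 - 0.145)/(2*eps).
print((settle(0.5 + eps, b) - settle(0.5 - eps, b)) / (2 * eps))  # ~355
```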
</p> <div class="mw-heading mw-heading3"><h3 id="Geometric_model">Geometric model</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vanishing_gradient_problem&action=edit&section=5" title="Edit section: Geometric model"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Continue using the above one-neuron network, fixing <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle w=5,x(0)=0.5,u(t)=0}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>w</mi> <mo>=</mo> <mn>5</mn> <mo>,</mo> <mi>x</mi> <mo stretchy="false">(</mo> <mn>0</mn> <mo stretchy="false">)</mo> <mo>=</mo> <mn>0.5</mn> <mo>,</mo> <mi>u</mi> <mo stretchy="false">(</mo> <mi>t</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mn>0</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle w=5,x(0)=0.5,u(t)=0}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/0bf0bb249020b73db7b9ae5c909c09fe3247032f" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:26.604ex; height:2.843ex;" alt="{\displaystyle w=5,x(0)=0.5,u(t)=0}" /></span>, and consider a loss function defined by <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle L(x(T))=(0.855-x(T))^{2}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>L</mi> <mo stretchy="false">(</mo> <mi>x</mi> <mo stretchy="false">(</mo> <mi>T</mi> <mo stretchy="false">)</mo> <mo stretchy="false">)</mo> <mo>=</mo> <mo stretchy="false">(</mo> <mn>0.855</mn> <mo>−<!-- − --></mo> <mi>x</mi> <mo stretchy="false">(</mo> <mi>T</mi> <mo stretchy="false">)</mo> <msup> <mo stretchy="false">)</mo> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle L(x(T))=(0.855-x(T))^{2}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/3bdc5e8aac424a363e16ba50c2a02a1787925f38" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:27.042ex; height:3.176ex;" alt="{\displaystyle L(x(T))=(0.855-x(T))^{2}}" /></span>. 
This produces a rather pathological loss landscape: as <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle b}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>b</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle b}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f11423fbb2e967f986e36804a8ae4271734917c3" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:0.998ex; height:2.176ex;" alt="{\displaystyle b}" /></span> approach <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle -2.5}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mo>−<!-- − --></mo> <mn>2.5</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle -2.5}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/0c64311fe3f8abf35d392a2174cf1f44d6be67f1" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.505ex; width:4.78ex; height:2.343ex;" alt="{\displaystyle -2.5}" /></span> from above, the loss approaches zero, but as soon as <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle b}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>b</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle b}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f11423fbb2e967f986e36804a8ae4271734917c3" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:0.998ex; height:2.176ex;" alt="{\displaystyle b}" /></span> crosses <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle -2.5}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mo>−<!-- − --></mo> <mn>2.5</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle -2.5}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/0c64311fe3f8abf35d392a2174cf1f44d6be67f1" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.505ex; width:4.78ex; height:2.343ex;" alt="{\displaystyle -2.5}" /></span>, the attractor basin changes, and loss jumps to 0.50.<sup id="cite_ref-attractor_12-0" class="reference"><a href="#cite_note-attractor-12"><span class="cite-bracket">[</span>note 4<span class="cite-bracket">]</span></a></sup> </p><p>Consequently, attempting to train <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle b}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>b</mi> </mstyle> </mrow> <annotation 
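The discontinuity can be traced with a parameter scan; a minimal sketch, assuming the same Euler integrator as before (the sampled $b$ values are illustrative):

```python
import numpy as np

def settle(x0, b, w=5.0, dt=0.01, T=100.0):
    # Euler-integrate dx/dt = -x + sigmoid(w*x + b) from x(0) = x0.
    x = x0
    for _ in range(int(T / dt)):
        x += dt * (-x + 1.0 / (1.0 + np.exp(-(w * x + b))))
    return x

# Scan the loss L = (0.855 - x(T))**2 as a function of b, with x(0) = 0.5.
for b in (-2.52, -2.51, -2.49, -2.48):
    loss = (0.855 - settle(0.5, b)) ** 2
    print(b, round(loss, 4))
# Just above b = -2.5 the trajectory settles near 0.855, so the loss is ~0;
# just below, it settles near 0.145 and the loss jumps to ~0.50.
```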
encoding="application/x-tex">{\displaystyle b}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f11423fbb2e967f986e36804a8ae4271734917c3" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:0.998ex; height:2.176ex;" alt="{\displaystyle b}" /></span> by gradient descent would "hit a wall in the loss landscape", and cause exploding gradient. A slightly more complex situation is plotted in,<sup id="cite_ref-:1_6-3" class="reference"><a href="#cite_note-:1-6"><span class="cite-bracket">[</span>6<span class="cite-bracket">]</span></a></sup> Figures 6. </p> <div class="mw-heading mw-heading2"><h2 id="Solutions">Solutions</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vanishing_gradient_problem&action=edit&section=6" title="Edit section: Solutions"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <style data-mw-deduplicate="TemplateStyles:r1251242444">.mw-parser-output .ambox{border:1px solid #a2a9b1;border-left:10px solid #36c;background-color:#fbfbfb;box-sizing:border-box}.mw-parser-output .ambox+link+.ambox,.mw-parser-output .ambox+link+style+.ambox,.mw-parser-output .ambox+link+link+.ambox,.mw-parser-output .ambox+.mw-empty-elt+link+.ambox,.mw-parser-output .ambox+.mw-empty-elt+link+style+.ambox,.mw-parser-output .ambox+.mw-empty-elt+link+link+.ambox{margin-top:-1px}html body.mediawiki .mw-parser-output .ambox.mbox-small-left{margin:4px 1em 4px 0;overflow:hidden;width:238px;border-collapse:collapse;font-size:88%;line-height:1.25em}.mw-parser-output .ambox-speedy{border-left:10px solid #b32424;background-color:#fee7e6}.mw-parser-output .ambox-delete{border-left:10px solid #b32424}.mw-parser-output .ambox-content{border-left:10px solid #f28500}.mw-parser-output .ambox-style{border-left:10px solid #fc3}.mw-parser-output .ambox-move{border-left:10px solid #9932cc}.mw-parser-output .ambox-protection{border-left:10px solid #a2a9b1}.mw-parser-output .ambox .mbox-text{border:none;padding:0.25em 0.5em;width:100%}.mw-parser-output .ambox .mbox-image{border:none;padding:2px 0 2px 0.5em;text-align:center}.mw-parser-output .ambox .mbox-imageright{border:none;padding:2px 0.5em 2px 0;text-align:center}.mw-parser-output .ambox .mbox-empty-cell{border:none;padding:0;width:1px}.mw-parser-output .ambox .mbox-image-div{width:52px}@media(min-width:720px){.mw-parser-output .ambox{margin:0 10%}}@media print{body.ns-0 .mw-parser-output .ambox{display:none!important}}</style><style data-mw-deduplicate="TemplateStyles:r1248332772">.mw-parser-output .multiple-issues-text{width:95%;margin:0.2em 0}.mw-parser-output .multiple-issues-text>.mw-collapsible-content{margin-top:0.3em}.mw-parser-output .compact-ambox .ambox{border:none;border-collapse:collapse;background-color:transparent;margin:0 0 0 1.6em!important;padding:0!important;width:auto;display:block}body.mediawiki .mw-parser-output .compact-ambox .ambox.mbox-small-left{font-size:100%;width:auto;margin:0}.mw-parser-output .compact-ambox .ambox .mbox-text{padding:0!important;margin:0!important}.mw-parser-output .compact-ambox .ambox .mbox-text-span{display:list-item;line-height:1.5em;list-style-type:disc}body.skin-minerva .mw-parser-output .multiple-issues-text>.mw-collapsible-toggle,.mw-parser-output .compact-ambox .ambox .mbox-image,.mw-parser-output .compact-ambox .ambox .mbox-imageright,.mw-parser-output .compact-ambox .ambox 
Solutions

To overcome this problem, several methods have been proposed.
</p> <div class="mw-heading mw-heading3"><h3 id="RNN">RNN</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vanishing_gradient_problem&action=edit&section=7" title="Edit section: RNN"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>For <a href="/wiki/Recurrent_neural_network" title="Recurrent neural network">recurrent neural networks</a>, the <a href="/wiki/Long_short-term_memory" title="Long short-term memory">long short-term memory</a> (LSTM) network was designed to solve the problem (<a href="/wiki/Sepp_Hochreiter" title="Sepp Hochreiter">Hochreiter</a> & <a href="/wiki/J%C3%BCrgen_Schmidhuber" title="Jürgen Schmidhuber">Schmidhuber</a>, 1997).<sup id="cite_ref-lstm_13-0" class="reference"><a href="#cite_note-lstm-13"><span class="cite-bracket">[</span>9<span class="cite-bracket">]</span></a></sup> </p><p>For the exploding gradient problem, (Pascanu et al, 2012)<sup id="cite_ref-:1_6-4" class="reference"><a href="#cite_note-:1-6"><span class="cite-bracket">[</span>6<span class="cite-bracket">]</span></a></sup> recommended gradient clipping, meaning dividing the gradient vector <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle g}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>g</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle g}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/d3556280e66fe2c0d0140df20935a6f057381d77" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:1.116ex; height:2.009ex;" alt="{\displaystyle g}" /></span> by <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \|g\|/g_{max}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mo fence="false" stretchy="false">‖<!-- ‖ --></mo> <mi>g</mi> <mo fence="false" stretchy="false">‖<!-- ‖ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mo>/</mo> </mrow> <msub> <mi>g</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>m</mi> <mi>a</mi> <mi>x</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \|g\|/g_{max}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/8539138f3089e1a506ca4fe0da1a8904c86e28c2" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:9.197ex; height:2.843ex;" alt="{\displaystyle \|g\|/g_{max}}" /></span> if <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \|g\|>g_{max}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mo fence="false" stretchy="false">‖<!-- ‖ --></mo> <mi>g</mi> <mo fence="false" stretchy="false">‖<!-- ‖ --></mo> <mo>></mo> <msub> <mi>g</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>m</mi> <mi>a</mi> <mi>x</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \|g\|>g_{max}}</annotation> </semantics> </math></span><img 
src="https://wikimedia.org/api/rest_v1/media/math/render/svg/e8964c1da83b56bc0b6ddbab62356f05177b648e" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:11.133ex; height:2.843ex;" alt="{\displaystyle \|g\|>g_{max}}" /></span>. This restricts the gradient vectors within a ball of radius <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle g_{max}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>g</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>m</mi> <mi>a</mi> <mi>x</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle g_{max}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/11d71812d5323f80034a10daad991f4b4fb8823b" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:4.594ex; height:2.009ex;" alt="{\displaystyle g_{max}}" /></span>. </p> <div class="mw-heading mw-heading3"><h3 id="Batch_normalization">Batch normalization</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vanishing_gradient_problem&action=edit&section=8" title="Edit section: Batch normalization"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p><a href="/wiki/Batch_normalization" title="Batch normalization">Batch normalization</a> is a standard method for solving both the exploding and the vanishing gradient problems.<sup id="cite_ref-14" class="reference"><a href="#cite_note-14"><span class="cite-bracket">[</span>10<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-15" class="reference"><a href="#cite_note-15"><span class="cite-bracket">[</span>11<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Multi-level_hierarchy">Multi-level hierarchy</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vanishing_gradient_problem&action=edit&section=9" title="Edit section: Multi-level hierarchy"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>In multi-level hierarchy of networks (<a href="/wiki/J%C3%BCrgen_Schmidhuber" title="Jürgen Schmidhuber">Schmidhuber</a>, 1992), pre-trained one level at a time through <a href="/wiki/Unsupervised_learning" title="Unsupervised learning">unsupervised learning</a>, fine-tuned through <a href="/wiki/Backpropagation" title="Backpropagation">backpropagation</a>.<sup id="cite_ref-SCHMID1992_16-0" class="reference"><a href="#cite_note-SCHMID1992-16"><span class="cite-bracket">[</span>12<span class="cite-bracket">]</span></a></sup> Here each level learns a compressed representation of the observations that is fed to the next level. 
</p> <div class="mw-heading mw-heading3"><h3 id="Deep_belief_network">Deep belief network</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vanishing_gradient_problem&action=edit&section=10" title="Edit section: Deep belief network"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Similar ideas have been used in feed-forward neural networks for unsupervised pre-training to structure a neural network, making it first learn generally useful <a href="/wiki/Feature_detection_(nervous_system)" title="Feature detection (nervous system)">feature detectors</a>. Then the network is trained further by supervised <a href="/wiki/Backpropagation" title="Backpropagation">backpropagation</a> to classify labeled data. The <a href="/wiki/Deep_belief_network" title="Deep belief network">deep belief network</a> model by Hinton et al. (2006) involves learning the distribution of a high-level representation using successive layers of binary or real-valued <a href="/wiki/Latent_variable" class="mw-redirect" title="Latent variable">latent variables</a>. It uses a <a href="/wiki/Restricted_Boltzmann_machine" title="Restricted Boltzmann machine">restricted Boltzmann machine</a> to model each new layer of higher level features. Each new layer guarantees an increase on the <a href="/wiki/Lower_bound" class="mw-redirect" title="Lower bound">lower-bound</a> of the <a href="/wiki/Log_likelihood" class="mw-redirect" title="Log likelihood">log likelihood</a> of the data, thus improving the model, if trained properly. Once sufficiently many layers have been learned the deep architecture may be used as a <a href="/wiki/Generative_model" title="Generative model">generative model</a> by reproducing the data when sampling down the model (an "ancestral pass") from the top level feature activations.<sup id="cite_ref-hinton2006_17-0" class="reference"><a href="#cite_note-hinton2006-17"><span class="cite-bracket">[</span>13<span class="cite-bracket">]</span></a></sup> Hinton reports that his models are effective feature extractors over high-dimensional, structured data.<sup id="cite_ref-18" class="reference"><a href="#cite_note-18"><span class="cite-bracket">[</span>14<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Faster_hardware">Faster hardware</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vanishing_gradient_problem&action=edit&section=11" title="Edit section: Faster hardware"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Hardware advances have meant that from 1991 to 2015, computer power (especially as delivered by <a href="/wiki/General-purpose_computing_on_graphics_processing_units" title="General-purpose computing on graphics processing units">GPUs</a>) has increased around a million-fold, making standard backpropagation feasible for networks several layers deeper than when the vanishing gradient problem was recognized. 
Schmidhuber notes that this "is basically what is winning many of the image recognition competitions now", but that it "does not really overcome the problem in a fundamental way",[15] since the original models tackling the vanishing gradient problem by Hinton and others were trained on a Xeon processor, not on GPUs.[13]

Residual connection

Residual connections, or skip connections, refer to the architectural motif $x \mapsto f(x) + x$, where $f$ is an arbitrary neural network module. This gives a gradient of $\nabla f + I$, where the identity term does not suffer from the vanishing or exploding gradient. During backpropagation, part of the gradient flows through the residual connections.[16]
Concretely, let the neural network (without residual connections) be $f_n \circ f_{n-1} \circ \cdots \circ f_1$. Then with residual connections, the gradient of the output with respect to the activations at layer $l$ is $I + \nabla f_{l+1} + \nabla f_{l+2}\nabla f_{l+1} + \cdots$. The gradient thus does not vanish in arbitrarily deep networks.
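A toy numeric sketch of why the identity term matters, with scalars standing in for the per-layer Jacobians (real networks have matrices, but the identity plays the same role):

```python
# Each layer's own derivative is small (0.1), as with saturating activations.
# Composing 50 plain layers multiplies these factors down to ~1e-50; with
# residual connections each factor is (1 + 0.1), and the identity
# contribution keeps the end-to-end gradient from vanishing.
depth, df = 50, 0.1

plain = df ** depth              # product of grad(f_i): vanishes
residual = (1 + df) ** depth     # product of (I + grad(f_i)): stays >= 1

print(f"plain:    {plain:.1e}")     # 1.0e-50
print(f"residual: {residual:.1f}")  # ~117.4
```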
Feedforward networks with residual connections can be regarded as an ensemble of relatively shallow nets. In this perspective, they resolve the vanishing gradient problem by being equivalent to ensembles of many shallow networks, for which there is no vanishing gradient problem.[17]

Other activation functions

Rectifiers such as ReLU suffer less from the vanishing gradient problem, because they only saturate in one direction.[18]

Weight initialization

Weight initialization is another approach that has been proposed to reduce the vanishing gradient problem in deep networks.

Kumar suggested that the distribution of initial weights should vary according to the activation function used, and proposed to initialize the weights in networks with the logistic activation function using a Gaussian distribution with zero mean and a standard deviation of 3.6/sqrt(N), where N is the number of neurons in a layer.[19]

Recently, Yilmaz and Poli[20] performed a theoretical analysis of how gradients are affected by the mean of the initial weights in deep neural networks using the logistic activation function, and found that gradients do not vanish if the mean of the initial weights is set according to the formula max(−1, −8/N). This simple strategy allows networks with 10 or 15 hidden layers to be trained very efficiently and effectively using standard backpropagation.
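A minimal sketch of the two initialization rules (here N is taken to be the fan-in of the layer, and the standard deviation in the second rule is a free choice, since the cited analysis prescribes only the mean):

```python
# Kumar: zero-mean Gaussian with standard deviation 3.6 / sqrt(N).
# Yilmaz & Poli: set the *mean* of the initial weights to max(-1, -8/N).
import numpy as np

rng = np.random.default_rng(0)

def init_kumar(n_in, n_out):
    return rng.normal(loc=0.0, scale=3.6 / np.sqrt(n_in), size=(n_in, n_out))

def init_yilmaz_poli(n_in, n_out, std=0.1):
    mean = max(-1.0, -8.0 / n_in)
    return rng.normal(loc=mean, scale=std, size=(n_in, n_out))

W1 = init_kumar(256, 128)
W2 = init_yilmaz_poli(256, 128)
print(W1.std().round(3))    # ~0.225 = 3.6 / sqrt(256)
print(W2.mean().round(3))   # ~-0.031 = -8 / 256
```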
Other

Behnke relied only on the sign of the gradient (Rprop) when training his Neural Abstraction Pyramid[21] to solve problems like image reconstruction and face localization.[citation needed]

Neural networks can also be optimized by using a universal search algorithm on the space of the neural network's weights, e.g., random guessing or, more systematically, a genetic algorithm. This approach is not based on gradients and avoids the vanishing gradient problem.[22]
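A minimal sketch of the simplest such search, pure random guessing over a weight vector (the objective here is a toy stand-in for a network's training loss):

```python
# Random search over weight space: propose random weight vectors, keep the
# best one seen. No gradient is ever computed, so none can vanish or explode.
import numpy as np

rng = np.random.default_rng(0)

def loss(w):
    return float(np.sum((w - 0.7) ** 2))   # toy objective with optimum at 0.7

best_w, best_loss = None, np.inf
for _ in range(10_000):
    w = rng.uniform(-1.0, 1.0, size=8)     # a random guess
    l = loss(w)
    if l < best_loss:
        best_w, best_loss = w, l

print(f"best loss after 10,000 guesses: {best_loss:.4f}")
```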
See also

- Spectral radius

Notes

1. A more general loss function could depend on the entire sequence of outputs, as $L(x_1, \ldots, x_T, u_1, \ldots, u_T) = \sum_{t=1}^T \mathcal{E}(x_t, u_1, \ldots, u_t)$, for which the problem is the same, just with more complex notation.
2. Any activation function works, as long as it is differentiable with bounded derivative.
3. Consider $W_{rec} = \begin{bmatrix} 0 & 2 \\ \epsilon & 0 \end{bmatrix}$ and $D = \begin{bmatrix} c & 0 \\ 0 & c \end{bmatrix}$, with $\epsilon > \frac{1}{2}$ and $c \in (0, 1)$. Then $W_{rec}$ has spectral radius $\sqrt{2\epsilon} > 1$, and $(W_{rec} D)^{2N} = (2\epsilon \cdot c^2)^N I_{2\times 2}$, which might go to infinity or zero depending on the choice of $c$.
4. This is because at $b = -2.5$, the two stable attractors are $x = 0.145, 0.855$, and the unstable attractor is $x = 0.5$.

References

1. Basodi, Sunitha; Ji, Chunyan; Zhang, Haiping; Pan, Yi (September 2020). "Gradient amplification: An efficient way to train deep neural networks". Big Data Mining and Analytics. 3 (3): 198. arXiv:2006.10560. doi:10.26599/BDMA.2020.9020004. ISSN 2096-0654.
2. Hochreiter, S. (1991). Untersuchungen zu dynamischen neuronalen Netzen [Investigations on dynamic neural networks] (PDF) (Diplom thesis). Institut f. Informatik, Technische Univ. Munich.
3. Hochreiter, S.; Bengio, Y.; Frasconi, P.; Schmidhuber, J. (2001). "Gradient flow in recurrent nets: the difficulty of learning long-term dependencies". In Kremer, S. C.; Kolen, J. F. (eds.). A Field Guide to Dynamical Recurrent Neural Networks. IEEE Press. doi:10.1109/9780470544037.ch14. ISBN 0-7803-5369-2.
4. Goh, Garrett B.; Hodas, Nathan O.; Vishnu, Abhinav (15 June 2017). "Deep learning for computational chemistry". Journal of Computational Chemistry. 38 (16): 1291–1307. arXiv:1701.04503. doi:10.1002/jcc.24764. PMID 28272810.
5. Bengio, Y.; Frasconi, P.; Simard, P. (1993). The problem of learning long-term dependencies in recurrent networks. IEEE International Conference on Neural Networks. IEEE. pp. 1183–1188. doi:10.1109/ICNN.1993.298725. ISBN 978-0-7803-0999-9.
6. Pascanu, Razvan; Mikolov, Tomas; Bengio, Yoshua (21 November 2012). "On the difficulty of training Recurrent Neural Networks". arXiv:1211.5063 [cs.LG].
7. Doya, K. (1992). "Bifurcations in the learning of recurrent neural networks". [Proceedings] 1992 IEEE International Symposium on Circuits and Systems. Vol. 6. IEEE. pp. 2777–2780. doi:10.1109/iscas.1992.230622. ISBN 0-7803-0593-0.
8. Bengio, Y.; Simard, P.; Frasconi, P. (March 1994). "Learning long-term dependencies with gradient descent is difficult". IEEE Transactions on Neural Networks. 5 (2): 157–166. doi:10.1109/72.279181. ISSN 1941-0093. PMID 18267787.
9. Hochreiter, Sepp; Schmidhuber, Jürgen (1997). "Long Short-Term Memory". Neural Computation. 9 (8): 1735–1780. doi:10.1162/neco.1997.9.8.1735. PMID 9377276.
10. Ioffe, Sergey; Szegedy, Christian (1 June 2015). "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift". International Conference on Machine Learning. PMLR: 448–456. arXiv:1502.03167.
11. Santurkar, Shibani; Tsipras, Dimitris; Ilyas, Andrew; Madry, Aleksander (2018). "How Does Batch Normalization Help Optimization?". Advances in Neural Information Processing Systems. 31. Curran Associates, Inc.
12. Schmidhuber, J. (1992). "Learning complex, extended sequences using the principle of history compression". Neural Computation. 4: 234–242.
13. Hinton, G. E.; Osindero, S.; Teh, Y. (2006). "A fast learning algorithm for deep belief nets" (PDF). Neural Computation.
<b>18</b> (7): <span class="nowrap">1527–</span>1554. <a href="/wiki/CiteSeerX_(identifier)" class="mw-redirect" title="CiteSeerX (identifier)">CiteSeerX</a> <span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.76.1541">10.1.1.76.1541</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1162%2Fneco.2006.18.7.1527">10.1162/neco.2006.18.7.1527</a>. <a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a> <a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/16764513">16764513</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:2309950">2309950</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Neural+Computation&rft.atitle=A+fast+learning+algorithm+for+deep+belief+nets&rft.volume=18&rft.issue=7&rft.pages=%3Cspan+class%3D%22nowrap%22%3E1527-%3C%2Fspan%3E1554&rft.date=2006&rft_id=https%3A%2F%2Fciteseerx.ist.psu.edu%2Fviewdoc%2Fsummary%3Fdoi%3D10.1.1.76.1541%23id-name%3DCiteSeerX&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A2309950%23id-name%3DS2CID&rft_id=info%3Apmid%2F16764513&rft_id=info%3Adoi%2F10.1162%2Fneco.2006.18.7.1527&rft.aulast=Hinton&rft.aufirst=G.+E.&rft.au=Osindero%2C+S.&rft.au=Teh%2C+Y.&rft_id=http%3A%2F%2Fwww.cs.toronto.edu%2F~hinton%2Fabsps%2Ffastnc.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVanishing+gradient+problem" class="Z3988"></span></span> </li> <li id="cite_note-18"><span class="mw-cite-backlink"><b><a href="#cite_ref-18">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFHinton2009" class="citation journal cs1">Hinton, G. (2009). <a rel="nofollow" class="external text" href="https://doi.org/10.4249%2Fscholarpedia.5947">"Deep belief networks"</a>. <i>Scholarpedia</i>. <b>4</b> (5): 5947. <a href="/wiki/Bibcode_(identifier)" class="mw-redirect" title="Bibcode (identifier)">Bibcode</a>:<a rel="nofollow" class="external text" href="https://ui.adsabs.harvard.edu/abs/2009SchpJ...4.5947H">2009SchpJ...4.5947H</a>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.4249%2Fscholarpedia.5947">10.4249/scholarpedia.5947</a></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Scholarpedia&rft.atitle=Deep+belief+networks&rft.volume=4&rft.issue=5&rft.pages=5947&rft.date=2009&rft_id=info%3Adoi%2F10.4249%2Fscholarpedia.5947&rft_id=info%3Abibcode%2F2009SchpJ...4.5947H&rft.aulast=Hinton&rft.aufirst=G.&rft_id=https%3A%2F%2Fdoi.org%2F10.4249%252Fscholarpedia.5947&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVanishing+gradient+problem" class="Z3988"></span></span> </li> <li id="cite_note-19"><span class="mw-cite-backlink"><b><a href="#cite_ref-19">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFSchmidhuber2015" class="citation journal cs1">Schmidhuber, Jürgen (2015). 
"Deep learning in neural networks: An overview". <i>Neural Networks</i>. <b>61</b>: <span class="nowrap">85–</span>117. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1404.7828">1404.7828</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1016%2Fj.neunet.2014.09.003">10.1016/j.neunet.2014.09.003</a>. <a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a> <a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/25462637">25462637</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:11715509">11715509</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Neural+Networks&rft.atitle=Deep+learning+in+neural+networks%3A+An+overview&rft.volume=61&rft.pages=%3Cspan+class%3D%22nowrap%22%3E85-%3C%2Fspan%3E117&rft.date=2015&rft_id=info%3Aarxiv%2F1404.7828&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A11715509%23id-name%3DS2CID&rft_id=info%3Apmid%2F25462637&rft_id=info%3Adoi%2F10.1016%2Fj.neunet.2014.09.003&rft.aulast=Schmidhuber&rft.aufirst=J%C3%BCrgen&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVanishing+gradient+problem" class="Z3988"></span></span> </li> <li id="cite_note-He2015-20"><span class="mw-cite-backlink"><b><a href="#cite_ref-He2015_20-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFHeZhangRenSun2016" class="citation conference cs1">He, Kaiming; Zhang, Xiangyu; Ren, Shaoqing; Sun, Jian (2016). <a rel="nofollow" class="external text" href="https://ieeexplore.ieee.org/document/7780459"><i>Deep Residual Learning for Image Recognition</i></a>. <i>2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</i>. Las Vegas, NV, USA: IEEE. pp. <span class="nowrap">770–</span>778. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1512.03385">1512.03385</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FCVPR.2016.90">10.1109/CVPR.2016.90</a>. 
<a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-1-4673-8851-1" title="Special:BookSources/978-1-4673-8851-1"><bdi>978-1-4673-8851-1</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=conference&rft.jtitle=2016+IEEE+Conference+on+Computer+Vision+and+Pattern+Recognition+%28CVPR%29&rft.atitle=Deep+Residual+Learning+for+Image+Recognition&rft.pages=%3Cspan+class%3D%22nowrap%22%3E770-%3C%2Fspan%3E778&rft.date=2016&rft_id=info%3Aarxiv%2F1512.03385&rft_id=info%3Adoi%2F10.1109%2FCVPR.2016.90&rft.isbn=978-1-4673-8851-1&rft.aulast=He&rft.aufirst=Kaiming&rft.au=Zhang%2C+Xiangyu&rft.au=Ren%2C+Shaoqing&rft.au=Sun%2C+Jian&rft_id=https%3A%2F%2Fieeexplore.ieee.org%2Fdocument%2F7780459&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVanishing+gradient+problem" class="Z3988"></span></span> </li> <li id="cite_note-21"><span class="mw-cite-backlink"><b><a href="#cite_ref-21">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFVeitWilberBelongie2016" class="citation arxiv cs1">Veit, Andreas; Wilber, Michael; Belongie, Serge (20 May 2016). "Residual Networks Behave Like Ensembles of Relatively Shallow Networks". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1605.06431">1605.06431</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CV">cs.CV</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=Residual+Networks+Behave+Like+Ensembles+of+Relatively+Shallow+Networks&rft.date=2016-05-20&rft_id=info%3Aarxiv%2F1605.06431&rft.aulast=Veit&rft.aufirst=Andreas&rft.au=Wilber%2C+Michael&rft.au=Belongie%2C+Serge&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVanishing+gradient+problem" class="Z3988"></span></span> </li> <li id="cite_note-22"><span class="mw-cite-backlink"><b><a href="#cite_ref-22">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFGlorotBordesBengio2011" class="citation journal cs1">Glorot, Xavier; Bordes, Antoine; Bengio, Yoshua (14 June 2011). <a rel="nofollow" class="external text" href="http://proceedings.mlr.press/v15/glorot11a.html">"Deep Sparse Rectifier Neural Networks"</a>. <i>PMLR</i>: <span class="nowrap">315–</span>323.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=PMLR&rft.atitle=Deep+Sparse+Rectifier+Neural+Networks&rft.pages=%3Cspan+class%3D%22nowrap%22%3E315-%3C%2Fspan%3E323&rft.date=2011-06-14&rft.aulast=Glorot&rft.aufirst=Xavier&rft.au=Bordes%2C+Antoine&rft.au=Bengio%2C+Yoshua&rft_id=http%3A%2F%2Fproceedings.mlr.press%2Fv15%2Fglorot11a.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVanishing+gradient+problem" class="Z3988"></span></span> </li> <li id="cite_note-23"><span class="mw-cite-backlink"><b><a href="#cite_ref-23">^</a></b></span> <span class="reference-text">Kumar, Siddharth Krishna. "On weight initialization in deep neural networks." 
<i><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1704.08863">arXiv preprint arXiv:1704.08863</a></i> (2017).</span> </li> <li id="cite_note-24"><span class="mw-cite-backlink"><b><a href="#cite_ref-24">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFYilmazPoli2022" class="citation journal cs1">Yilmaz, Ahmet; Poli, Riccardo (1 September 2022). <a rel="nofollow" class="external text" href="https://www.sciencedirect.com/science/article/pii/S0893608022002040">"Successfully and efficiently training deep multi-layer perceptrons with logistic activation function simply requires initializing the weights with an appropriate negative mean"</a>. <i>Neural Networks</i>. <b>153</b>: <span class="nowrap">87–</span>103. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1016%2Fj.neunet.2022.05.030">10.1016/j.neunet.2022.05.030</a>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/0893-6080">0893-6080</a>. <a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a> <a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/35714424">35714424</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:249487697">249487697</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Neural+Networks&rft.atitle=Successfully+and+efficiently+training+deep+multi-layer+perceptrons+with+logistic+activation+function+simply+requires+initializing+the+weights+with+an+appropriate+negative+mean&rft.volume=153&rft.pages=%3Cspan+class%3D%22nowrap%22%3E87-%3C%2Fspan%3E103&rft.date=2022-09-01&rft.issn=0893-6080&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A249487697%23id-name%3DS2CID&rft_id=info%3Apmid%2F35714424&rft_id=info%3Adoi%2F10.1016%2Fj.neunet.2022.05.030&rft.aulast=Yilmaz&rft.aufirst=Ahmet&rft.au=Poli%2C+Riccardo&rft_id=https%3A%2F%2Fwww.sciencedirect.com%2Fscience%2Farticle%2Fpii%2FS0893608022002040&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVanishing+gradient+problem" class="Z3988"></span></span> </li> <li id="cite_note-25"><span class="mw-cite-backlink"><b><a href="#cite_ref-25">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFSven_Behnke2003" class="citation book cs1">Sven Behnke (2003). <a rel="nofollow" class="external text" href="http://www.ais.uni-bonn.de/books/LNCS2766.pdf"><i>Hierarchical Neural Networks for Image Interpretation</i></a> <span class="cs1-format">(PDF)</span>. Lecture Notes in Computer Science. Vol. 2766. 
Springer.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Hierarchical+Neural+Networks+for+Image+Interpretation.&rft.series=Lecture+Notes+in+Computer+Science&rft.pub=Springer&rft.date=2003&rft.au=Sven+Behnke&rft_id=http%3A%2F%2Fwww.ais.uni-bonn.de%2Fbooks%2FLNCS2766.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVanishing+gradient+problem" class="Z3988"></span></span> </li> <li id="cite_note-26"><span class="mw-cite-backlink"><b><a href="#cite_ref-26">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="http://people.idsia.ch/~juergen/fundamentaldeeplearningproblem.html">"Sepp Hochreiter's Fundamental Deep Learning Problem (1991)"</a>. <i>people.idsia.ch</i><span class="reference-accessdate">. Retrieved <span class="nowrap">7 January</span> 2017</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=people.idsia.ch&rft.atitle=Sepp+Hochreiter%27s+Fundamental+Deep+Learning+Problem+%281991%29&rft_id=http%3A%2F%2Fpeople.idsia.ch%2F~juergen%2Ffundamentaldeeplearningproblem.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVanishing+gradient+problem" class="Z3988"></span></span> </li> </ol></div> <p class="mw-empty-elt"> </p> <!-- NewPP limit report Parsed by mw‐web.codfw.main‐7b4fff7949‐2d5fr Cached time: 20250326152519 Cache expiry: 2592000 Reduced expiry: false Complications: [vary‐revision‐sha1, show‐toc] CPU time usage: 0.645 seconds Real time usage: 0.881 seconds Preprocessor visited node count: 3136/1000000 Post‐expand include size: 112583/2097152 bytes Template argument size: 11627/2097152 bytes Highest expansion depth: 16/100 Expensive parser function count: 4/500 Unstrip recursion depth: 1/20 Unstrip post‐expand size: 107230/5000000 bytes Lua time usage: 0.316/10.000 seconds Lua memory usage: 6562326/52428800 bytes Number of Wikibase entities loaded: 0/400 --> <!-- Transclusion expansion time report (%,ms,calls,template) 100.00% 613.694 1 -total 36.90% 226.433 3 Template:Reflist 21.85% 134.109 11 Template:Cite_journal 18.85% 115.686 1 Template:Machine_learning_bar 16.99% 104.281 1 Template:Sidebar_with_collapsible_lists 14.82% 90.972 1 Template:Short_description 10.99% 67.460 1 Template:Multiple_issues 8.05% 49.398 2 Template:Pagetype 6.76% 41.455 1 Template:NumBlk 6.36% 39.051 2 Template:Ambox --> <!-- Saved in parser cache with key enwiki:pcache:43502368:|#|:idhash:canonical and timestamp 20250326152519 and revision id 1272854228. 