Speech recognition - Wikipedia

[o]" accesskey="o"><span class="vector-icon mw-ui-icon-logIn mw-ui-icon-wikimedia-logIn"></span> <span>Log in</span></a></li> </ul> </div> </div> <div id="p-user-menu-anon-editor" class="vector-menu mw-portlet mw-portlet-user-menu-anon-editor" > <div class="vector-menu-heading"> Pages for logged out editors <a href="/wiki/Help:Introduction" aria-label="Learn more about editing"><span>learn more</span></a> </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-anoncontribs" class="mw-list-item"><a href="/wiki/Special:MyContributions" title="A list of edits made from this IP address [y]" accesskey="y"><span>Contributions</span></a></li><li id="pt-anontalk" class="mw-list-item"><a href="/wiki/Special:MyTalk" title="Discussion about edits from this IP address [n]" accesskey="n"><span>Talk</span></a></li> </ul> </div> </div> </div> </div> </nav> </div> </header> </div> <div class="mw-page-container"> <div class="mw-page-container-inner"> <div class="vector-sitenotice-container"> <div id="siteNotice"><!-- CentralNotice --></div> </div> <div class="vector-column-start"> <div class="vector-main-menu-container"> <div id="mw-navigation"> <nav id="mw-panel" class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-pinned-container" class="vector-pinned-container"> </div> </nav> </div> </div> <div class="vector-sticky-pinned-container"> <nav id="mw-panel-toc" aria-label="Contents" data-event-name="ui.sidebar-toc" class="mw-table-of-contents-container vector-toc-landmark"> <div id="vector-toc-pinned-container" class="vector-pinned-container"> <div id="vector-toc" class="vector-toc vector-pinnable-element"> <div class="vector-pinnable-header vector-toc-pinnable-header vector-pinnable-header-pinned" data-feature-name="toc-pinned" data-pinnable-element-id="vector-toc" > <h2 class="vector-pinnable-header-label">Contents</h2> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-toc.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-toc.unpin">hide</button> </div> <ul class="vector-toc-contents" id="mw-panel-toc-list"> <li id="toc-mw-content-text" class="vector-toc-list-item vector-toc-level-1"> <a href="#" class="vector-toc-link"> <div class="vector-toc-text">(Top)</div> </a> </li> <li id="toc-History" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#History"> <div class="vector-toc-text"> <span class="vector-toc-numb">1</span> <span>History</span> </div> </a> <button aria-controls="toc-History-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle History subsection</span> </button> <ul id="toc-History-sublist" class="vector-toc-list"> <li id="toc-Pre-1970" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Pre-1970"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.1</span> <span>Pre-1970</span> </div> </a> <ul id="toc-Pre-1970-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-1970–1990" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#1970–1990"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.2</span> <span>1970–1990</span> </div> </a> <ul id="toc-1970–1990-sublist" class="vector-toc-list"> </ul> </li> <li 
id="toc-Practical_speech_recognition" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Practical_speech_recognition"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.3</span> <span>Practical speech recognition</span> </div> </a> <ul id="toc-Practical_speech_recognition-sublist" class="vector-toc-list"> <li id="toc-2000s" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#2000s"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.3.1</span> <span>2000s</span> </div> </a> <ul id="toc-2000s-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-2010s" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#2010s"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.3.2</span> <span>2010s</span> </div> </a> <ul id="toc-2010s-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> </ul> </li> <li id="toc-Models,_methods,_and_algorithms" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Models,_methods,_and_algorithms"> <div class="vector-toc-text"> <span class="vector-toc-numb">2</span> <span>Models, methods, and algorithms</span> </div> </a> <button aria-controls="toc-Models,_methods,_and_algorithms-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Models, methods, and algorithms subsection</span> </button> <ul id="toc-Models,_methods,_and_algorithms-sublist" class="vector-toc-list"> <li id="toc-Hidden_Markov_models" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Hidden_Markov_models"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.1</span> <span>Hidden Markov models</span> </div> </a> <ul id="toc-Hidden_Markov_models-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Dynamic_time_warping_(DTW)-based_speech_recognition" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Dynamic_time_warping_(DTW)-based_speech_recognition"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.2</span> <span>Dynamic time warping (DTW)-based speech recognition</span> </div> </a> <ul id="toc-Dynamic_time_warping_(DTW)-based_speech_recognition-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Neural_networks" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Neural_networks"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.3</span> <span>Neural networks</span> </div> </a> <ul id="toc-Neural_networks-sublist" class="vector-toc-list"> <li id="toc-Deep_feedforward_and_recurrent_neural_networks" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Deep_feedforward_and_recurrent_neural_networks"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.3.1</span> <span>Deep feedforward and recurrent neural networks</span> </div> </a> <ul id="toc-Deep_feedforward_and_recurrent_neural_networks-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-End-to-end_automatic_speech_recognition" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#End-to-end_automatic_speech_recognition"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.4</span> <span>End-to-end automatic speech recognition</span> </div> </a> <ul id="toc-End-to-end_automatic_speech_recognition-sublist" class="vector-toc-list"> </ul> </li> 
</ul> </li> <li id="toc-Applications" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Applications"> <div class="vector-toc-text"> <span class="vector-toc-numb">3</span> <span>Applications</span> </div> </a> <button aria-controls="toc-Applications-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Applications subsection</span> </button> <ul id="toc-Applications-sublist" class="vector-toc-list"> <li id="toc-In-car_systems" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#In-car_systems"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.1</span> <span>In-car systems</span> </div> </a> <ul id="toc-In-car_systems-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Education" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Education"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.2</span> <span>Education</span> </div> </a> <ul id="toc-Education-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Health_care" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Health_care"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.3</span> <span>Health care</span> </div> </a> <ul id="toc-Health_care-sublist" class="vector-toc-list"> <li id="toc-Medical_documentation" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Medical_documentation"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.3.1</span> <span>Medical documentation</span> </div> </a> <ul id="toc-Medical_documentation-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Therapeutic_use" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Therapeutic_use"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.3.2</span> <span>Therapeutic use</span> </div> </a> <ul id="toc-Therapeutic_use-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Military" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Military"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.4</span> <span>Military</span> </div> </a> <ul id="toc-Military-sublist" class="vector-toc-list"> <li id="toc-High-performance_fighter_aircraft" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#High-performance_fighter_aircraft"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.4.1</span> <span>High-performance fighter aircraft</span> </div> </a> <ul id="toc-High-performance_fighter_aircraft-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Helicopters" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Helicopters"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.4.2</span> <span>Helicopters</span> </div> </a> <ul id="toc-Helicopters-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Training_air_traffic_controllers" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Training_air_traffic_controllers"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.4.3</span> <span>Training air traffic controllers</span> </div> </a> <ul id="toc-Training_air_traffic_controllers-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Telephony_and_other_domains" class="vector-toc-list-item 
vector-toc-level-2"> <a class="vector-toc-link" href="#Telephony_and_other_domains"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.5</span> <span>Telephony and other domains</span> </div> </a> <ul id="toc-Telephony_and_other_domains-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-People_with_disabilities" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#People_with_disabilities"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.6</span> <span>People with disabilities</span> </div> </a> <ul id="toc-People_with_disabilities-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Further_applications" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Further_applications"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.7</span> <span>Further applications</span> </div> </a> <ul id="toc-Further_applications-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Performance" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Performance"> <div class="vector-toc-text"> <span class="vector-toc-numb">4</span> <span>Performance</span> </div> </a> <button aria-controls="toc-Performance-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Performance subsection</span> </button> <ul id="toc-Performance-sublist" class="vector-toc-list"> <li id="toc-Accuracy" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Accuracy"> <div class="vector-toc-text"> <span class="vector-toc-numb">4.1</span> <span>Accuracy</span> </div> </a> <ul id="toc-Accuracy-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Security_concerns" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Security_concerns"> <div class="vector-toc-text"> <span class="vector-toc-numb">4.2</span> <span>Security concerns</span> </div> </a> <ul id="toc-Security_concerns-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Further_information" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Further_information"> <div class="vector-toc-text"> <span class="vector-toc-numb">5</span> <span>Further information</span> </div> </a> <button aria-controls="toc-Further_information-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Further information subsection</span> </button> <ul id="toc-Further_information-sublist" class="vector-toc-list"> <li id="toc-Conferences_and_journals" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Conferences_and_journals"> <div class="vector-toc-text"> <span class="vector-toc-numb">5.1</span> <span>Conferences and journals</span> </div> </a> <ul id="toc-Conferences_and_journals-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Books" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Books"> <div class="vector-toc-text"> <span class="vector-toc-numb">5.2</span> <span>Books</span> </div> </a> <ul id="toc-Books-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Software" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Software"> <div class="vector-toc-text"> <span class="vector-toc-numb">5.3</span> 
<span>Software</span> </div> </a> <ul id="toc-Software-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-See_also" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#See_also"> <div class="vector-toc-text"> <span class="vector-toc-numb">6</span> <span>See also</span> </div> </a> <ul id="toc-See_also-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-References" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#References"> <div class="vector-toc-text"> <span class="vector-toc-numb">7</span> <span>References</span> </div> </a> <ul id="toc-References-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Further_reading" class="vector-toc-list-item vector-toc-level-1"> <a class="vector-toc-link" href="#Further_reading"> <div class="vector-toc-text"> <span class="vector-toc-numb">8</span> <span>Further reading</span> </div> </a> <ul id="toc-Further_reading-sublist" class="vector-toc-list"> </ul> </li> </ul> </div> </div> </nav> </div> </div> <div class="mw-content-container"> <main id="content" class="mw-body"> <header class="mw-body-header vector-page-titlebar"> <nav aria-label="Contents" class="vector-toc-landmark"> <div id="vector-page-titlebar-toc" class="vector-dropdown vector-page-titlebar-toc vector-button-flush-left" title="Table of Contents" > <input type="checkbox" id="vector-page-titlebar-toc-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-titlebar-toc" class="vector-dropdown-checkbox " aria-label="Toggle the table of contents" > <label id="vector-page-titlebar-toc-label" for="vector-page-titlebar-toc-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-listBullet mw-ui-icon-wikimedia-listBullet"></span> <span class="vector-dropdown-label-text">Toggle the table of contents</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-titlebar-toc-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <h1 id="firstHeading" class="firstHeading mw-first-heading"><span class="mw-page-title-main">Speech recognition</span></h1> <div id="p-lang-btn" class="vector-dropdown mw-portlet mw-portlet-lang" > <input type="checkbox" id="p-lang-btn-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-p-lang-btn" class="vector-dropdown-checkbox mw-interlanguage-selector" aria-label="Go to an article in another language. 
data-event-name="pinnable-header.vector-appearance.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-appearance.unpin">hide</button> </div> </div> </div> </nav> </div> </div> <div id="bodyContent" class="vector-body" aria-labelledby="firstHeading" data-mw-ve-target-container> <div class="vector-body-before-content"> <div class="mw-indicators"> </div> <div id="siteSub" class="noprint">From Wikipedia, the free encyclopedia</div> </div> <div id="contentSub"><div id="mw-content-subtitle"></div></div> <div id="mw-content-text" class="mw-body-content"><div class="mw-content-ltr mw-parser-output" lang="en" dir="ltr"><div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">Automatic conversion of spoken language into text</div> <style data-mw-deduplicate="TemplateStyles:r1236090951">.mw-parser-output .hatnote{font-style:italic}.mw-parser-output div.hatnote{padding-left:1.6em;margin-bottom:0.5em}.mw-parser-output .hatnote i{font-style:normal}.mw-parser-output .hatnote+link+.hatnote{margin-top:-0.5em}@media print{body.ns-0 .mw-parser-output .hatnote{display:none!important}}</style><div role="note" class="hatnote navigation-not-searchable">For the human linguistic concept, see <a href="/wiki/Speech_perception" title="Speech perception">Speech perception</a>.</div> <p> <b>Speech recognition</b> is an <a href="/wiki/Interdisciplinary" class="mw-redirect" title="Interdisciplinary">interdisciplinary</a> subfield of <a href="/wiki/Computer_science" title="Computer science">computer science</a> and <a href="/wiki/Computational_linguistics" title="Computational linguistics">computational linguistics</a> that develops <a href="/wiki/Methodology" title="Methodology">methodologies</a> and technologies that enable the recognition and <a href="/wiki/Translation" title="Translation">translation</a> of spoken language into text by computers. It is also known as <b>automatic speech recognition</b> (<b>ASR</b>), <b>computer speech recognition</b> or <b>speech-to-text</b> (<b>STT</b>). It incorporates knowledge and research in the <a href="/wiki/Computer_science" title="Computer science">computer science</a>, <a href="/wiki/Linguistics" title="Linguistics">linguistics</a> and <a href="/wiki/Computer_engineering" title="Computer engineering">computer engineering</a> fields. The reverse process is <a href="/wiki/Speech_synthesis" title="Speech synthesis">speech synthesis</a>. </p><p>Some speech recognition systems require "training" (also called "enrollment") where an individual speaker reads text or isolated <a href="/wiki/Vocabulary" title="Vocabulary">vocabulary</a> into the system. The system analyzes the person's specific voice and uses it to fine-tune the recognition of that person's speech, resulting in increased accuracy. Systems that do not use training are called "speaker-independent"<sup id="cite_ref-1" class="reference"><a href="#cite_note-1"><span class="cite-bracket">&#91;</span>1<span class="cite-bracket">&#93;</span></a></sup> systems. Systems that use training are called "speaker dependent". </p><p>Speech recognition applications include <a href="/wiki/Voice_user_interface" title="Voice user interface">voice user interfaces</a> such as voice dialing (e.g. "call home"), call routing (e.g. "I would like to make a collect call"), <a href="/wiki/Domotic" class="mw-redirect" title="Domotic">domotic</a> appliance control, search key words (e.g. 
find a podcast where particular words were spoken), simple data entry (e.g., entering a credit card number), preparation of structured documents (e.g. a radiology report), determining speaker characteristics,<sup id="cite_ref-2" class="reference"><a href="#cite_note-2"><span class="cite-bracket">&#91;</span>2<span class="cite-bracket">&#93;</span></a></sup> speech-to-text processing (e.g., <a href="/wiki/Word_processor" title="Word processor">word processors</a> or <a href="/wiki/Email" title="Email">emails</a>), and <a href="/wiki/Aircraft" title="Aircraft">aircraft</a> (usually termed <a href="/wiki/Direct_voice_input" title="Direct voice input">direct voice input</a>). Automatic <a href="/wiki/Pronunciation_assessment" title="Pronunciation assessment">pronunciation assessment</a> is used in education such as for spoken language learning. </p><p><span class="anchor" id="vs_voice_rec"></span>The term <i>voice recognition</i><sup id="cite_ref-Macmillan_Brit._def_of_voice_recognition_3-0" class="reference"><a href="#cite_note-Macmillan_Brit._def_of_voice_recognition-3"><span class="cite-bracket">&#91;</span>3<span class="cite-bracket">&#93;</span></a></sup><sup id="cite_ref-Voice_rec,_definition_4-0" class="reference"><a href="#cite_note-Voice_rec,_definition-4"><span class="cite-bracket">&#91;</span>4<span class="cite-bracket">&#93;</span></a></sup><sup id="cite_ref-mail_bag,_gazette_5-0" class="reference"><a href="#cite_note-mail_bag,_gazette-5"><span class="cite-bracket">&#91;</span>5<span class="cite-bracket">&#93;</span></a></sup> or <i><a href="/wiki/Speaker_recognition" title="Speaker recognition">speaker identification</a></i><sup id="cite_ref-6" class="reference"><a href="#cite_note-6"><span class="cite-bracket">&#91;</span>6<span class="cite-bracket">&#93;</span></a></sup><sup id="cite_ref-7" class="reference"><a href="#cite_note-7"><span class="cite-bracket">&#91;</span>7<span class="cite-bracket">&#93;</span></a></sup><sup id="cite_ref-8" class="reference"><a href="#cite_note-8"><span class="cite-bracket">&#91;</span>8<span class="cite-bracket">&#93;</span></a></sup> refers to identifying the speaker, rather than what they are saying. <a href="/wiki/Speaker_recognition" title="Speaker recognition">Recognizing the speaker</a> can simplify the task of <a href="/wiki/Speech_translation" title="Speech translation">translating speech</a> in systems that have been trained on a specific person's voice or it can be used to <a href="/wiki/Authentication" title="Authentication">authenticate</a> or verify the identity of a speaker as part of a security process. </p><p>From the technology perspective, speech recognition has a long history with several waves of major innovations. Most recently, the field has benefited from advances in <a href="/wiki/Deep_learning" title="Deep learning">deep learning</a> and <a href="/wiki/Big_data" title="Big data">big data</a>. The advances are evidenced not only by the surge of academic papers published in the field, but more importantly by the worldwide industry adoption of a variety of deep learning methods in designing and deploying speech recognition systems. 
</p> <meta property="mw:PageProp/toc" /> <div class="mw-heading mw-heading2"><h2 id="History">History</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Speech_recognition&amp;action=edit&amp;section=1" title="Edit section: History"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>The key areas of growth were: vocabulary size, speaker independence, and processing speed. </p> <div class="mw-heading mw-heading3"><h3 id="Pre-1970">Pre-1970</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Speech_recognition&amp;action=edit&amp;section=2" title="Edit section: Pre-1970"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <ul><li><b>1952</b> – Three Bell Labs researchers, Stephen Balashek,<sup id="cite_ref-9" class="reference"><a href="#cite_note-9"><span class="cite-bracket">&#91;</span>9<span class="cite-bracket">&#93;</span></a></sup> R. Biddulph, and K. H. Davis built a system called "Audrey"<sup id="cite_ref-10" class="reference"><a href="#cite_note-10"><span class="cite-bracket">&#91;</span>10<span class="cite-bracket">&#93;</span></a></sup> for single-speaker digit recognition. Their system located the <a href="/wiki/Formants" class="mw-redirect" title="Formants">formants</a> in the power spectrum of each utterance.<sup id="cite_ref-11" class="reference"><a href="#cite_note-11"><span class="cite-bracket">&#91;</span>11<span class="cite-bracket">&#93;</span></a></sup></li> <li><b>1960</b> – <a href="/wiki/Gunnar_Fant" title="Gunnar Fant">Gunnar Fant</a> developed and published the <a href="/wiki/Source-filter_model_of_speech_production" class="mw-redirect" title="Source-filter model of speech production">source-filter model of speech production</a>.</li> <li><b>1962</b> – <a href="/wiki/IBM" title="IBM">IBM</a> demonstrated its 16-word "Shoebox" machine's speech recognition capability at the <a href="/wiki/1962_World%27s_Fair" class="mw-redirect" title="1962 World&#39;s Fair">1962 World's Fair</a>.<sup id="cite_ref-PCW.Siri_12-0" class="reference"><a href="#cite_note-PCW.Siri-12"><span class="cite-bracket">&#91;</span>12<span class="cite-bracket">&#93;</span></a></sup></li> <li><b>1966</b> – <a href="/wiki/Linear_predictive_coding" title="Linear predictive coding">Linear predictive coding</a> (LPC), a <a href="/wiki/Speech_coding" title="Speech coding">speech coding</a> method, was first proposed by <a href="/wiki/Fumitada_Itakura" title="Fumitada Itakura">Fumitada Itakura</a> of <a href="/wiki/Nagoya_University" title="Nagoya University">Nagoya University</a> and Shuzo Saito of <a href="/wiki/Nippon_Telegraph_and_Telephone" title="Nippon Telegraph and Telephone">Nippon Telegraph and Telephone</a> (NTT), while working on speech recognition.<sup id="cite_ref-Gray_13-0" class="reference"><a href="#cite_note-Gray-13"><span class="cite-bracket">&#91;</span>13<span class="cite-bracket">&#93;</span></a></sup></li> <li><b>1969</b> – Funding at <a href="/wiki/Bell_Labs" title="Bell Labs">Bell Labs</a> dried up for several years when, in 1969, the influential <a href="/wiki/John_R._Pierce" title="John R. 
Pierce">John Pierce</a> wrote an open letter that was critical of and defunded speech recognition research.<sup id="cite_ref-jasapierce_14-0" class="reference"><a href="#cite_note-jasapierce-14"><span class="cite-bracket">&#91;</span>14<span class="cite-bracket">&#93;</span></a></sup> This defunding lasted until Pierce retired and <a href="/wiki/James_L._Flanagan" title="James L. Flanagan">James L. Flanagan</a> took over.</li></ul> <p><a href="/wiki/Raj_Reddy" title="Raj Reddy">Raj Reddy</a> was the first person to take on continuous speech recognition as a graduate student at <a href="/wiki/Stanford_University" title="Stanford University">Stanford University</a> in the late 1960s. Previous systems required users to pause after each word. Reddy's system issued spoken commands for playing <a href="/wiki/Chess" title="Chess">chess</a>. </p><p>Around this time Soviet researchers invented the <a href="/wiki/Dynamic_time_warping" title="Dynamic time warping">dynamic time warping</a> (DTW) algorithm and used it to create a recognizer capable of operating on a 200-word vocabulary.<sup id="cite_ref-15" class="reference"><a href="#cite_note-15"><span class="cite-bracket">&#91;</span>15<span class="cite-bracket">&#93;</span></a></sup> DTW processed speech by dividing it into short frames, e.g. 10ms segments, and processing each frame as a single unit. Although DTW would be superseded by later algorithms, the technique carried on. Achieving speaker independence remained unsolved at this time period. </p> <div class="mw-heading mw-heading3"><h3 id="1970–1990"><span id="1970.E2.80.931990"></span>1970–1990</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Speech_recognition&amp;action=edit&amp;section=3" title="Edit section: 1970–1990"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <ul><li><b>1971</b> – <a href="/wiki/DARPA" title="DARPA">DARPA</a> funded five years for <i>Speech Understanding Research</i>, speech recognition research seeking a minimum vocabulary size of 1,000 words. 
They thought <a href="/wiki/Natural-language_understanding" class="mw-redirect" title="Natural-language understanding">speech <i>understanding</i></a> would be key to making progress in speech <i>recognition</i>, but this later proved untrue.<sup id="cite_ref-16" class="reference"><a href="#cite_note-16"><span class="cite-bracket">&#91;</span>16<span class="cite-bracket">&#93;</span></a></sup> <a href="/wiki/BBN_Technologies" class="mw-redirect" title="BBN Technologies">BBN</a>, <a href="/wiki/IBM" title="IBM">IBM</a>, <a href="/wiki/Carnegie_Mellon" class="mw-redirect" title="Carnegie Mellon">Carnegie Mellon</a> and <a href="/wiki/Stanford_Research_Institute" class="mw-redirect" title="Stanford Research Institute">Stanford Research Institute</a> all participated in the program.<sup id="cite_ref-17" class="reference"><a href="#cite_note-17"><span class="cite-bracket">&#91;</span>17<span class="cite-bracket">&#93;</span></a></sup><sup id="cite_ref-18" class="reference"><a href="#cite_note-18"><span class="cite-bracket">&#91;</span>18<span class="cite-bracket">&#93;</span></a></sup> This revived speech recognition research post John Pierce's letter.</li> <li><b>1972</b> – The IEEE Acoustics, Speech, and Signal Processing group held a conference in Newton, Massachusetts.</li> <li><b>1976</b> – The first <a href="/wiki/ICASSP" class="mw-redirect" title="ICASSP">ICASSP</a> was held in <a href="/wiki/Philadelphia" title="Philadelphia">Philadelphia</a>, which since then has been a major venue for the publication of research on speech recognition.<sup id="cite_ref-19" class="reference"><a href="#cite_note-19"><span class="cite-bracket">&#91;</span>19<span class="cite-bracket">&#93;</span></a></sup></li></ul> <p>During the late 1960s <a href="/wiki/Leonard_E._Baum" title="Leonard E. Baum">Leonard Baum</a> developed the mathematics of <a href="/wiki/Markov_chain" title="Markov chain">Markov chains</a> at the <a href="/wiki/Institute_for_Defense_Analysis" class="mw-redirect" title="Institute for Defense Analysis">Institute for Defense Analysis</a>. A decade later, at CMU, Raj Reddy's students <a href="/wiki/James_K._Baker" title="James K. Baker">James Baker</a> and <a href="/wiki/Janet_M._Baker" title="Janet M. Baker">Janet M. Baker</a> began using the <a href="/wiki/Hidden_Markov_model" title="Hidden Markov model">hidden Markov model</a> (HMM) for speech recognition.<sup id="cite_ref-20" class="reference"><a href="#cite_note-20"><span class="cite-bracket">&#91;</span>20<span class="cite-bracket">&#93;</span></a></sup> James Baker had learned about HMMs from a summer job at the Institute of Defense Analysis during his undergraduate education.<sup id="cite_ref-James_Baker_interview_21-0" class="reference"><a href="#cite_note-James_Baker_interview-21"><span class="cite-bracket">&#91;</span>21<span class="cite-bracket">&#93;</span></a></sup> The use of HMMs allowed researchers to combine different sources of knowledge, such as acoustics, language, and syntax, in a unified probabilistic model. 
- Mid-1980s – Fred Jelinek's team at IBM created a voice-activated typewriter called Tangora, which could handle a 20,000-word vocabulary.[22] Jelinek's statistical approach put less emphasis on emulating the way the human brain processes and understands speech in favor of statistical modeling techniques like HMMs. (Jelinek's group independently discovered the application of HMMs to speech.[21]) This was controversial with linguists, since HMMs are too simplistic to account for many common features of human languages.[23] However, the HMM proved to be a highly useful way of modeling speech and replaced dynamic time warping to become the dominant speech recognition algorithm in the 1980s.[24][25]
- 1982 – Dragon Systems, founded by James and Janet M. Baker,[26] was one of IBM's few competitors.

Practical speech recognition

The 1980s also saw the introduction of the n-gram language model.

- 1987 – The back-off model allowed language models to use multiple-length n-grams, and CSELT[27] used HMMs to recognize languages (both in software and in specialized hardware processors, e.g. RIPAC).

Much of the progress in the field is owed to the rapidly increasing capabilities of computers. At the end of the DARPA program in 1976, the best computer available to researchers was the PDP-10 with 4 MB of RAM.[28] It could take up to 100 minutes to decode just 30 seconds of speech.[29]

Practical products included:

- 1984 – The Apricot Portable was released, with support for up to 4096 words, of which only 64 could be held in RAM at a time.[30]
- 1987 – A recognizer from Kurzweil Applied Intelligence was released.
- 1990 – Dragon Dictate, a consumer dictation product, was released.[31][32]
- 1992 – AT&T deployed the Voice Recognition Call Processing service to route telephone calls without the use of a human operator.[33] The technology was developed by Lawrence Rabiner and others at Bell Labs.

By this point, the vocabulary of the typical commercial speech recognition system was larger than the average human vocabulary.[28] Raj Reddy's former student Xuedong Huang developed the Sphinx-II system at CMU. The Sphinx-II system was the first to do speaker-independent, large-vocabulary, continuous speech recognition, and it had the best performance in DARPA's 1992 evaluation. Handling continuous speech with a large vocabulary was a major milestone in the history of speech recognition. Huang went on to found the speech recognition group at Microsoft in 1993. Raj Reddy's student Kai-Fu Lee joined Apple where, in 1992, he helped develop a speech interface prototype for the Apple computer known as Casper.

Lernout & Hauspie, a Belgium-based speech recognition company, acquired several other companies, including Kurzweil Applied Intelligence in 1997 and Dragon Systems in 2000. The L&H speech technology was used in the Windows XP operating system. L&H was an industry leader until an accounting scandal brought an end to the company in 2001. The speech technology from L&H was bought by ScanSoft, which became Nuance in 2005. Apple originally licensed software from Nuance to provide speech recognition capability to its digital assistant Siri.[34]

2000s

In the 2000s DARPA sponsored two speech recognition programs: Effective Affordable Reusable Speech-to-Text (EARS) in 2002 and Global Autonomous Language Exploitation (GALE). Four teams participated in the EARS program: IBM, a team led by BBN with LIMSI and the University of Pittsburgh, Cambridge University, and a team composed of ICSI, SRI and the University of Washington. EARS funded the collection of the Switchboard telephone speech corpus, containing 260 hours of recorded conversations from over 500 speakers.[35] The GALE program focused on Arabic and Mandarin broadcast news speech. Google's first effort at speech recognition came in 2007 after hiring some researchers from Nuance.[36] The first product was GOOG-411, a telephone-based directory service. The recordings from GOOG-411 produced valuable data that helped Google improve its recognition systems. Google Voice Search is now supported in over 30 languages.
In the United States, the National Security Agency has made use of a type of speech recognition for keyword spotting since at least 2006.[37] This technology allows analysts to search through large volumes of recorded conversations and isolate mentions of keywords. Recordings can be indexed, and analysts can run queries over the database to find conversations of interest. Some government research programs focused on intelligence applications of speech recognition, e.g. DARPA's EARS program and IARPA's Babel program.

In the early 2000s, speech recognition was still dominated by traditional approaches such as hidden Markov models combined with feedforward artificial neural networks.[38] Today, however, many aspects of speech recognition have been taken over by a deep learning method called long short-term memory (LSTM), a recurrent neural network published by Sepp Hochreiter and Jürgen Schmidhuber in 1997.[39] LSTM RNNs avoid the vanishing gradient problem and can learn "Very Deep Learning" tasks[40] that require memories of events that happened thousands of discrete time steps ago, which is important for speech. Around 2007, LSTM trained by connectionist temporal classification (CTC)[41] started to outperform traditional speech recognition in certain applications.[42] In 2015, Google's speech recognition reportedly experienced a dramatic performance jump of 49% through CTC-trained LSTM, which is now available through Google Voice to all smartphone users.[43] Transformers, a type of neural network based solely on "attention", have been widely adopted in computer vision[44][45] and language modeling,[46][47] sparking interest in adapting such models to new domains, including speech recognition.[48][49][50] Some recent papers have reported superior performance using transformer models for speech recognition, but these models usually require large-scale training datasets to reach high performance levels.

The use of deep feedforward (non-recurrent) networks for acoustic modeling was introduced during the later part of 2009 by Geoffrey Hinton and his students at the University of Toronto and by Li Deng[51] and colleagues at Microsoft Research, initially in collaborative work between Microsoft and the University of Toronto which was subsequently expanded to include IBM and Google (hence the "The shared views of four research groups" subtitle of their 2012 review paper).[52][53][54] A Microsoft research executive called this innovation "the most dramatic change in accuracy since 1979".[55] In contrast to the steady incremental improvements of the past few decades, the application of deep learning decreased word error rate by 30%.[55] This innovation was quickly adopted across the field. Researchers have begun to use deep learning techniques for language modeling as well.

In the long history of speech recognition, both shallow and deep forms (e.g. recurrent nets) of artificial neural networks had been explored for many years during the 1980s, 1990s and a few years into the 2000s.[56][57][58] But these methods never won out over the heavily hand-crafted Gaussian mixture model/hidden Markov model (GMM-HMM) technology based on generative models of speech trained discriminatively.[59] A number of key difficulties had been methodologically analyzed in the 1990s, including gradient diminishing[60] and weak temporal correlation structure in the neural predictive models.[61][62] All these difficulties were in addition to the lack of big training data and big computing power in those early days. Most speech recognition researchers who understood such barriers hence moved away from neural nets to pursue generative modeling approaches, until the resurgence of deep learning starting around 2009–2010 overcame these difficulties. Hinton et al. and Deng et al. reviewed part of this recent history of how their collaboration with each other, and then with colleagues across four groups (University of Toronto, Microsoft, Google, and IBM), ignited a renaissance of applications of deep feedforward neural networks for speech recognition.[53][54][63][64]

2010s

By the early 2010s speech recognition, also called voice recognition,[65][66][67] was clearly differentiated from speaker recognition, and speaker independence was considered a major breakthrough. Until then, systems had required a "training" period. A 1987 ad for a doll had carried the tagline "Finally, the doll that understands you." despite being described as one "which children could train to respond to their voice".[12]

In 2017, Microsoft researchers reached a historic human-parity milestone of transcribing conversational telephony speech on the widely benchmarked Switchboard task. Multiple deep learning models were used to optimize speech recognition accuracy.
The word error rate was reported to be as low as that of four professional human transcribers working together on the same benchmark, in an evaluation funded by the IBM Watson speech team on the same task.[68]

Models, methods, and algorithms

Both acoustic modeling and language modeling are important parts of modern statistically based speech recognition algorithms. Hidden Markov models (HMMs) are widely used in many systems. Language modeling is also used in many other natural language processing applications such as document classification or statistical machine translation.

Hidden Markov models

Main article: Hidden Markov model

Modern general-purpose speech recognition systems are based on hidden Markov models. These are statistical models that output a sequence of symbols or quantities. HMMs are used in speech recognition because a speech signal can be viewed as a piecewise stationary signal or a short-time stationary signal. On a short time scale (e.g., 10 milliseconds), speech can be approximated as a stationary process. Speech can thus be thought of as a Markov model for many stochastic purposes.

Another reason why HMMs are popular is that they can be trained automatically and are simple and computationally feasible to use. In speech recognition, the hidden Markov model would output a sequence of n-dimensional real-valued vectors (with n being a small integer, such as 10), outputting one of these every 10 milliseconds. The vectors would consist of cepstral coefficients, which are obtained by taking a Fourier transform of a short time window of speech, decorrelating the spectrum using a cosine transform, and then taking the first (most significant) coefficients.
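The front end just described can be sketched in a few lines of NumPy. This is a minimal illustration of the window → Fourier transform → log → cosine transform chain only, not a production feature extractor; the frame length, hop size, and number of coefficients are illustrative assumptions, and real systems typically add mel filter banks and pre-emphasis.

```python
import numpy as np

def cepstral_features(signal, sample_rate=16000, frame_ms=25, hop_ms=10, n_coeffs=13):
    """Minimal cepstral front end: window -> FFT -> log magnitude -> DCT."""
    frame_len = int(sample_rate * frame_ms / 1000)   # samples per analysis window
    hop = int(sample_rate * hop_ms / 1000)           # one vector every 10 ms
    window = np.hamming(frame_len)
    features = []
    for start in range(0, len(signal) - frame_len, hop):
        frame = signal[start:start + frame_len] * window
        spectrum = np.abs(np.fft.rfft(frame))        # short-time Fourier transform
        log_spec = np.log(spectrum + 1e-10)          # log compresses dynamic range
        # A type-II DCT decorrelates the log spectrum; keep the first coefficients.
        n = len(log_spec)
        k = np.arange(n)
        cepstrum = np.array([
            np.sum(log_spec * np.cos(np.pi * q * (2 * k + 1) / (2 * n)))
            for q in range(n_coeffs)
        ])
        features.append(cepstrum)
    return np.stack(features)                        # shape: (num_frames, n_coeffs)

# Example: one second of synthetic audio -> roughly one vector per 10 ms.
rng = np.random.default_rng(0)
feats = cepstral_features(rng.standard_normal(16000))
print(feats.shape)  # (98, 13)
```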
In each state, the hidden Markov model will tend to have a statistical distribution that is a mixture of diagonal-covariance Gaussians, which gives a likelihood for each observed vector. Each word, or (for more general speech recognition systems) each phoneme, has a different output distribution; a hidden Markov model for a sequence of words or phonemes is made by concatenating the individually trained hidden Markov models for the separate words and phonemes.

Described above are the core elements of the most common, HMM-based approach to speech recognition. Modern speech recognition systems use various combinations of a number of standard techniques in order to improve results over the basic approach described above. A typical large-vocabulary system would need context dependency for the phonemes (so that phonemes with different left and right context have different realizations as HMM states); it would use cepstral normalization to normalize for different speakers and recording conditions; for further speaker normalization, it might use vocal tract length normalization (VTLN) for male-female normalization and maximum likelihood linear regression (MLLR) for more general speaker adaptation. The features would have so-called delta and delta-delta coefficients to capture speech dynamics and, in addition, might use heteroscedastic linear discriminant analysis (HLDA); or might skip the delta and delta-delta coefficients and use splicing and an LDA-based projection followed perhaps by heteroscedastic linear discriminant analysis or a global semi-tied covariance transform (also known as maximum likelihood linear transform, or MLLT).
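Two of the simpler techniques above, cepstral normalization and delta/delta-delta coefficients, can be sketched directly on the feature matrix produced by the earlier front-end sketch. This is an illustration under common textbook conventions (per-utterance mean subtraction and a ±2-frame regression window); actual systems vary in their windowing and normalization details.

```python
import numpy as np

def cepstral_mean_normalize(feats):
    """Subtract the per-utterance mean of each cepstral coefficient.

    Removes stationary channel/speaker offsets (a simple form of
    cepstral normalization).
    """
    return feats - feats.mean(axis=0, keepdims=True)

def delta(feats, window=2):
    """Regression-based deltas: d_t = sum_d d*(c_{t+d} - c_{t-d}) / (2*sum d^2)."""
    padded = np.pad(feats, ((window, window), (0, 0)), mode="edge")
    denom = 2 * sum(d * d for d in range(1, window + 1))
    out = np.zeros_like(feats)
    for d in range(1, window + 1):
        out += d * (padded[window + d : window + d + len(feats)]
                    - padded[window - d : window - d + len(feats)])
    return out / denom

# Stack statics, deltas, and delta-deltas into one observation vector per frame.
normed = cepstral_mean_normalize(feats)          # `feats` from the sketch above
d1 = delta(normed)
d2 = delta(d1)
observations = np.hstack([normed, d1, d2])       # shape: (num_frames, 39)
```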
Many systems use so-called discriminative training techniques that dispense with a purely statistical approach to HMM parameter estimation and instead optimize some classification-related measure of the training data. Examples are maximum mutual information (MMI), minimum classification error (MCE), and minimum phone error (MPE).

Decoding of the speech (the term for what happens when the system is presented with a new utterance and must compute the most likely source sentence) would probably use the Viterbi algorithm to find the best path, and here there is a choice between dynamically creating a combination hidden Markov model, which includes both the acoustic and language model information, and combining it statically beforehand (the finite state transducer, or FST, approach).
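As a concrete illustration of the decoding step, a bare-bones Viterbi search over a small discrete HMM is sketched below. It assumes the log-domain transition and emission tables are already given; in a real recognizer the emissions would come from the Gaussian mixtures (or a neural network) and the search graph would be the composed acoustic/language transducer rather than a dense matrix.

```python
import numpy as np

def viterbi(log_init, log_trans, log_emit, observations):
    """Most likely HMM state path for a discrete observation sequence.

    log_init:  (S,)    log prior over states
    log_trans: (S, S)  log P(state_j | state_i)
    log_emit:  (S, O)  log P(obs | state)
    Everything is in the log domain so products become sums and nothing underflows.
    """
    T, S = len(observations), len(log_init)
    score = np.full((T, S), -np.inf)    # best log score ending in state s at time t
    back = np.zeros((T, S), dtype=int)  # argmax predecessor for traceback
    score[0] = log_init + log_emit[:, observations[0]]
    for t in range(1, T):
        # Candidate scores for every (previous state, current state) pair.
        cand = score[t - 1][:, None] + log_trans
        back[t] = np.argmax(cand, axis=0)
        score[t] = cand[back[t], np.arange(S)] + log_emit[:, observations[t]]
    # Trace the best path backwards from the best final state.
    path = [int(np.argmax(score[-1]))]
    for t in range(T - 1, 0, -1):
        path.append(int(back[t][path[-1]]))
    return path[::-1]

# Toy example: 2 states, 2 observation symbols.
path = viterbi(np.log([0.6, 0.4]),
               np.log([[0.7, 0.3], [0.4, 0.6]]),
               np.log([[0.9, 0.1], [0.2, 0.8]]),
               [0, 0, 1, 1])
print(path)  # [0, 0, 1, 1]
```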
A possible improvement to decoding is to keep a set of good candidates instead of just the single best candidate, and to use a better scoring function (re-scoring) to rate these candidates so that the best one can be picked according to this refined score. The set of candidates can be kept either as a list (the N-best list approach) or as a subset of the models (a lattice). Re-scoring is usually done by trying to minimize the Bayes risk[69] (or an approximation thereof). Instead of taking the source sentence with maximal probability, we try to take the sentence that minimizes the expected value of a given loss function over all possible transcriptions (i.e., we take the sentence that minimizes the average distance to other possible sentences, weighted by their estimated probability). The loss function is usually the Levenshtein distance, though it can be a different distance for specific tasks; the set of possible transcriptions is, of course, pruned to maintain tractability. Efficient algorithms have been devised to re-score lattices represented as weighted finite state transducers with edit distances represented themselves as a finite state transducer verifying certain assumptions.[70]

Dynamic time warping (DTW)-based speech recognition

Main article: Dynamic time warping

Dynamic time warping is an approach that was historically used for speech recognition but has now largely been displaced by the more successful HMM-based approach.

Dynamic time warping is an algorithm for measuring similarity between two sequences that may vary in time or speed. For instance, similarities in walking patterns would be detected, even if in one video the person was walking slowly and in another was walking more quickly, or even if there were accelerations and decelerations during the course of one observation. DTW has been applied to video, audio, and graphics; indeed, any data that can be turned into a linear representation can be analyzed with DTW.

A well-known application has been automatic speech recognition, to cope with different speaking speeds. In general, it is a method that allows a computer to find an optimal match between two given sequences (e.g., time series) with certain restrictions. That is, the sequences are "warped" non-linearly to match each other. This sequence alignment method is often used in the context of hidden Markov models.
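A minimal dynamic-programming implementation of DTW on two feature sequences is sketched below; it returns the cumulative alignment cost under the standard match/insert/delete step pattern. Historical ASR implementations added slope constraints and search bands, which are omitted here for clarity.

```python
import numpy as np

def dtw_distance(a, b):
    """Dynamic time warping cost between two sequences of feature vectors.

    a: (Ta, d) array, b: (Tb, d) array. Classic O(Ta*Tb) dynamic program:
    each cell extends the cheapest of the three allowed predecessor moves,
    which is what "warps" one sequence non-linearly onto the other.
    """
    Ta, Tb = len(a), len(b)
    D = np.full((Ta + 1, Tb + 1), np.inf)
    D[0, 0] = 0.0
    for i in range(1, Ta + 1):
        for j in range(1, Tb + 1):
            cost = np.linalg.norm(a[i - 1] - b[j - 1])   # local frame distance
            D[i, j] = cost + min(D[i - 1, j],      # insertion
                                 D[i, j - 1],      # deletion
                                 D[i - 1, j - 1])  # match
    return D[Ta, Tb]

# The same "word" at two speaking rates still aligns with low cost.
t = np.linspace(0, 1, 50)
slow = np.sin(2 * np.pi * t)[:, None]
fast = np.sin(2 * np.pi * t[::2])[:, None]   # same trajectory in half the frames
print(dtw_distance(slow, fast))              # small relative to mismatched signals
```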
Neural networks

Main article: Artificial neural network

Neural networks emerged as an attractive acoustic modeling approach in ASR in the late 1980s. Since then, neural networks have been used in many aspects of speech recognition such as phoneme classification,[71] phoneme classification through multi-objective evolutionary algorithms,[72] isolated word recognition,[73] audiovisual speech recognition, audiovisual speaker recognition and speaker adaptation.

Neural networks make fewer explicit assumptions about feature statistical properties than HMMs and have several qualities that make them attractive recognition models for speech recognition. When used to estimate the probabilities of a speech feature segment, neural networks allow discriminative training in a natural and efficient manner. However, in spite of their effectiveness in classifying short-time units such as individual phonemes and isolated words,[74] early neural networks were rarely successful for continuous recognition tasks because of their limited ability to model temporal dependencies.

One approach to this limitation was to use neural networks as a pre-processing, feature transformation, or dimensionality reduction step[75] prior to HMM-based recognition.
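In the classic hybrid use of a network alongside an HMM, the network estimates per-frame phone posteriors, which are divided by the phone priors to yield scaled likelihoods that the HMM decoder consumes in place of Gaussian-mixture emission scores. A schematic numpy sketch, with random weights standing in for a trained model and illustrative (assumed) layer sizes:

```python
import numpy as np

rng = np.random.default_rng(0)

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

# Illustrative sizes: 39-dim MFCC-style frames, 64 hidden units, 40 phone classes.
W1, b1 = rng.normal(size=(39, 64)), np.zeros(64)
W2, b2 = rng.normal(size=(64, 40)), np.zeros(40)
phone_priors = np.full(40, 1 / 40)      # would be estimated from training data

def frame_posteriors(frames):
    """frames: (T, 39) acoustic features -> (T, 40) phone posteriors."""
    h = np.tanh(frames @ W1 + b1)
    return softmax(h @ W2 + b2)

def scaled_likelihoods(frames):
    """Posterior divided by prior: the quantity a hybrid NN/HMM decoder
    uses as an emission score."""
    return frame_posteriors(frames) / phone_priors

frames = rng.normal(size=(100, 39))     # 100 frames of dummy features
print(scaled_likelihoods(frames).shape) # -> (100, 40)
```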
However, more recently, LSTM and related recurrent neural networks (RNNs),[39][43][76][77] time delay neural networks (TDNNs),[78] and transformers[48][49][50] have demonstrated improved performance in this area.

Deep feedforward and recurrent neural networks

Main article: Deep learning

Deep neural networks and denoising autoencoders[79] are also under investigation. A deep feedforward neural network (DNN) is an artificial neural network with multiple hidden layers of units between the input and output layers.[53] Similar to shallow neural networks, DNNs can model complex non-linear relationships.
DNN architectures generate compositional models, where extra layers enable composition of features from lower layers, giving a huge learning capacity and thus the potential of modeling complex patterns of speech data.[80]

A success of DNNs in large vocabulary speech recognition occurred in 2010 by industrial researchers, in collaboration with academic researchers, where large output layers of the DNN based on context-dependent HMM states constructed by decision trees were adopted.[81][82][83] See comprehensive reviews of this development and of the state of the art as of October 2014 in the Springer book from Microsoft Research.[84] See also the related background of automatic speech recognition and the impact of various machine learning paradigms, notably including deep learning, in recent overview articles.[85][86]

One fundamental principle of deep learning is to do away with hand-crafted feature engineering and to use raw features. This principle was first explored successfully in the architecture of the deep autoencoder on the "raw" spectrogram or linear filter-bank features,[87] showing its superiority over the Mel-cepstral features, which contain a few stages of fixed transformation from spectrograms.
The true "raw" features of speech, waveforms, have more recently been shown to produce excellent larger-scale speech recognition results.[88]

End-to-end automatic speech recognition

Since 2014, there has been much research interest in "end-to-end" ASR. Traditional phonetic-based (i.e., all HMM-based model) approaches required separate components and training for the pronunciation, acoustic, and language model. End-to-end models jointly learn all the components of the speech recognizer. This is valuable since it simplifies the training and deployment process. For example, an n-gram language model is required for all HMM-based systems, and a typical n-gram language model often takes several gigabytes of memory, making it impractical to deploy on mobile devices.[89] Consequently, modern commercial ASR systems from Google and Apple (as of 2017) are deployed on the cloud and require a network connection, rather than running locally on the device.

The first attempt at end-to-end ASR was with Connectionist Temporal Classification (CTC)-based systems introduced by Alex Graves of Google DeepMind and Navdeep Jaitly of the University of Toronto in 2014.[90] The model consisted of recurrent neural networks and a CTC layer. Jointly, the RNN-CTC model learns the pronunciation and acoustic model together; however, it is incapable of learning the language model due to conditional independence assumptions similar to an HMM. Consequently, CTC models can directly learn to map speech acoustics to English characters, but the models make many common spelling mistakes and must rely on a separate language model to clean up the transcripts.
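To make the character-level CTC output concrete, the following sketch performs greedy CTC decoding: pick the most probable symbol per frame, collapse consecutive repeats, then drop the blank symbol. The frame posteriors here are toy numbers, not a trained model's output.

```python
import numpy as np

BLANK = "_"
alphabet = [BLANK, "c", "a", "t"]

# Toy per-frame posteriors over the alphabet, shape (frames, symbols).
posteriors = np.array([
    [0.1, 0.8, 0.05, 0.05],  # "c"
    [0.1, 0.7, 0.1,  0.1],   # "c"  (repeat, will collapse)
    [0.8, 0.1, 0.05, 0.05],  # blank separates symbols
    [0.1, 0.1, 0.7,  0.1],   # "a"
    [0.1, 0.1, 0.1,  0.7],   # "t"
])

def ctc_greedy_decode(posteriors):
    best = [alphabet[i] for i in posteriors.argmax(axis=1)]
    out, prev = [], None
    for sym in best:
        if sym != prev and sym != BLANK:  # collapse repeats, drop blanks
            out.append(sym)
        prev = sym
    return "".join(out)

print(ctc_greedy_decode(posteriors))  # -> "cat"
```

A real system would replace the per-frame argmax with beam search, optionally fused with an external language model to correct the spelling mistakes mentioned above.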
Later, Baidu expanded on the work with extremely large datasets and demonstrated some commercial success in Mandarin Chinese and English.[91] In 2016, the University of Oxford presented LipNet,[92] the first end-to-end sentence-level lipreading model, using spatiotemporal convolutions coupled with an RNN-CTC architecture, surpassing human-level performance in a restricted grammar dataset.[93] A large-scale CNN-RNN-CTC architecture was presented in 2018 by Google DeepMind, achieving 6 times better performance than human experts.[94] In 2019, Nvidia launched two CNN-CTC ASR models, Jasper and QuartzNet, with an overall performance WER of 3%.[95][96] Similar to other deep learning applications, transfer learning and domain adaptation are important strategies for reusing and extending the capabilities of deep learning models, particularly due to the high costs of training models from scratch and the small size of the available corpora in many languages and/or specific domains.[97][98][99]

An alternative approach to CTC-based models is attention-based models. Attention-based ASR models were introduced simultaneously by Chan et al. of Carnegie Mellon University and Google Brain and Bahdanau et al. of the University of Montreal in 2016.[100][101]
The model named "Listen, Attend and Spell" (LAS) literally "listens" to the acoustic signal, pays "attention" to different parts of the signal and "spells" out the transcript one character at a time. Unlike CTC-based models, attention-based models do not have conditional-independence assumptions and can learn all the components of a speech recognizer, including the pronunciation, acoustic and language model, directly. This means that, during deployment, there is no need to carry around a language model, making it very practical for applications with limited memory. By the end of 2016, attention-based models had seen considerable success, including outperforming the CTC models (with or without an external language model).[102] Various extensions have been proposed since the original LAS model. Latent Sequence Decompositions (LSD) was proposed by Carnegie Mellon University, MIT and Google Brain to directly emit sub-word units, which are more natural than English characters;[103] the University of Oxford and Google DeepMind extended LAS to "Watch, Listen, Attend and Spell" (WLAS) to handle lip reading, surpassing human-level performance.[104]
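The "attention" step at the heart of such models computes, for each output character, a weighted summary of the encoder's acoustic states. A bare-bones dot-product attention sketch, with random vectors standing in for learned encoder states and the decoder query:

```python
import numpy as np

rng = np.random.default_rng(1)

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

def attend(encoder_states, query):
    """encoder_states: (T, d) acoustic encodings; query: (d,) decoder state.
    Returns the attention-weighted context vector and the weights."""
    scores = encoder_states @ query / np.sqrt(query.size)  # scaled dot product
    weights = softmax(scores)           # how much to "listen" to each frame
    context = weights @ encoder_states  # weighted summary of the signal
    return context, weights

T, d = 50, 16                           # 50 encoded frames, 16-dim states
encoder_states = rng.normal(size=(T, d))
query = rng.normal(size=d)
context, weights = attend(encoder_states, query)
print(context.shape, weights.sum())     # -> (16,) and 1.0 (up to rounding)
```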
<sup class="noprint Inline-Template Template-Fact" style="white-space:nowrap;">&#91;<i><a href="/wiki/Wikipedia:Citation_needed" title="Wikipedia:Citation needed"><span title="This claim needs references to reliable sources. (March 2014)">citation needed</span></a></i>&#93;</sup> </p><p>Simple voice commands may be used to initiate phone calls, select radio stations or play music from a compatible smartphone, MP3 player or music-loaded flash drive. Voice recognition capabilities vary between car make and model. Some of the most recent<sup class="noprint Inline-Template" style="white-space:nowrap;">&#91;<i><a href="/wiki/Wikipedia:Manual_of_Style/Dates_and_numbers#Chronological_items" title="Wikipedia:Manual of Style/Dates and numbers"><span title="The time period mentioned near this tag is ambiguous. (April 2014)">when?</span></a></i>&#93;</sup> car models offer natural-language speech recognition in place of a fixed set of commands, allowing the driver to use full sentences and common phrases. With such systems there is, therefore, no need for the user to memorize a set of fixed command words.<sup class="noprint Inline-Template Template-Fact" style="white-space:nowrap;">&#91;<i><a href="/wiki/Wikipedia:Citation_needed" title="Wikipedia:Citation needed"><span title="This claim needs references to reliable sources. (March 2014)">citation needed</span></a></i>&#93;</sup> </p> <div class="mw-heading mw-heading3"><h3 id="Education">Education</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Speech_recognition&amp;action=edit&amp;section=15" title="Edit section: Education"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951" /><div role="note" class="hatnote navigation-not-searchable">Main article: <a href="/wiki/Pronunciation_assessment" title="Pronunciation assessment">Pronunciation assessment</a></div> <p>Automatic <a href="/wiki/Pronunciation" title="Pronunciation">pronunciation</a> assessment is the use of speech recognition to verify the correctness of pronounced speech,<sup id="cite_ref-105" class="reference"><a href="#cite_note-105"><span class="cite-bracket">&#91;</span>105<span class="cite-bracket">&#93;</span></a></sup> as distinguished from manual assessment by an instructor or proctor.<sup id="cite_ref-106" class="reference"><a href="#cite_note-106"><span class="cite-bracket">&#91;</span>106<span class="cite-bracket">&#93;</span></a></sup> Also called speech verification, pronunciation evaluation, and pronunciation scoring, the main application of this technology is computer-aided pronunciation teaching (CAPT) when combined with <a href="/wiki/Computer-aided_instruction" class="mw-redirect" title="Computer-aided instruction">computer-aided instruction</a> for <a href="/wiki/Computer-assisted_language_learning" title="Computer-assisted language learning">computer-assisted language learning</a> (CALL), speech <a href="/wiki/Remedial_education" title="Remedial education">remediation</a>, or <a href="/wiki/Accent_reduction" title="Accent reduction">accent reduction</a>. 
Pronunciation assessment does not determine unknown speech (as in dictation or automatic transcription) but instead, knowing the expected word(s) in advance, it attempts to verify the correctness of the learner's pronunciation and ideally their intelligibility to listeners,[107][108] sometimes along with often inconsequential prosody such as intonation, pitch, tempo, rhythm, and stress.[109] Pronunciation assessment is also used in reading tutoring, for example in products such as Microsoft Teams[110] and from Amira Learning.[111] Automatic pronunciation assessment can also be used to help diagnose and treat speech disorders such as apraxia.[112]

Assessing authentic listener intelligibility is essential for avoiding inaccuracies from accent bias, especially in high-stakes assessments;[113][114][115] from words with multiple correct pronunciations;[116] and from phoneme coding errors in machine-readable pronunciation dictionaries.[117]
href="#cite_note-117"><span class="cite-bracket">&#91;</span>117<span class="cite-bracket">&#93;</span></a></sup> In 2022, researchers found that some newer speech to text systems, based on <a href="/wiki/End-to-end_reinforcement_learning" class="mw-redirect" title="End-to-end reinforcement learning">end-to-end reinforcement learning</a> to map audio signals directly into words, produce word and phrase confidence scores very closely correlated with genuine listener intelligibility.<sup id="cite_ref-118" class="reference"><a href="#cite_note-118"><span class="cite-bracket">&#91;</span>118<span class="cite-bracket">&#93;</span></a></sup> In the <a href="/wiki/Common_European_Framework_of_Reference_for_Languages" title="Common European Framework of Reference for Languages">Common European Framework of Reference for Languages</a> (CEFR) assessment criteria for "overall phonological control", intelligibility outweighs formally correct pronunciation at all levels.<sup id="cite_ref-119" class="reference"><a href="#cite_note-119"><span class="cite-bracket">&#91;</span>119<span class="cite-bracket">&#93;</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Health_care">Health care</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Speech_recognition&amp;action=edit&amp;section=16" title="Edit section: Health care"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <div class="mw-heading mw-heading4"><h4 id="Medical_documentation">Medical documentation</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Speech_recognition&amp;action=edit&amp;section=17" title="Edit section: Medical documentation"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>In the <a href="/wiki/Health_care" title="Health care">health care</a> sector, speech recognition can be implemented in front-end or back-end of the medical documentation process. Front-end speech recognition is where the provider dictates into a speech-recognition engine, the recognized words are displayed as they are spoken, and the dictator is responsible for editing and signing off on the document. Back-end or deferred speech recognition is where the provider dictates into a <a href="/wiki/Digital_dictation" class="mw-redirect" title="Digital dictation">digital dictation</a> system, the voice is routed through a speech-recognition machine and the recognized draft document is routed along with the original voice file to the editor, where the draft is edited and report finalized. Deferred speech recognition is widely used in the industry currently. </p><p>One of the major issues relating to the use of speech recognition in healthcare is that the <a href="/wiki/American_Recovery_and_Reinvestment_Act_of_2009" title="American Recovery and Reinvestment Act of 2009">American Recovery and Reinvestment Act of 2009</a> (<a href="/wiki/American_Recovery_and_Reinvestment_Act_of_2009" title="American Recovery and Reinvestment Act of 2009">ARRA</a>) provides for substantial financial benefits to physicians who utilize an EMR according to "Meaningful Use" standards. These standards require that a substantial amount of data be maintained by the EMR (now more commonly referred to as an <a href="/wiki/Electronic_Health_Record" class="mw-redirect" title="Electronic Health Record">Electronic Health Record</a> or EHR). 
The use of speech recognition is more naturally suited to the generation of narrative text, as part of a radiology/pathology interpretation, progress note or discharge summary: the ergonomic gains of using speech recognition to enter structured discrete data (e.g., numeric values or codes from a list or a controlled vocabulary) are relatively minimal for people who are sighted and who can operate a keyboard and mouse.

A more significant issue is that most EHRs have not been expressly tailored to take advantage of voice-recognition capabilities. A large part of the clinician's interaction with the EHR involves navigation through the user interface using menus and tab/button clicks, and is heavily dependent on keyboard and mouse: voice-based navigation provides only modest ergonomic benefits. By contrast, many highly customized systems for radiology or pathology dictation implement voice "macros", where the use of certain phrases – e.g., "normal report" – will automatically fill in a large number of default values and/or generate boilerplate, which will vary with the type of the exam – e.g., a chest X-ray vs. a gastrointestinal contrast series for a radiology system.

Therapeutic use

Prolonged use of speech recognition software in conjunction with word processors has shown benefits to short-term-memory restrengthening in brain AVM patients who have been treated with resection. Further research needs to be conducted to determine cognitive benefits for individuals whose AVMs have been treated using radiologic techniques.[citation needed]

Military

High-performance fighter aircraft

Substantial efforts have been devoted in the last decade to the test and evaluation of speech recognition in fighter aircraft.
Of particular note have been the US program in speech recognition for the Advanced Fighter Technology Integration (AFTI)/F-16 aircraft (F-16 VISTA), the program in France for Mirage aircraft, and other programs in the UK dealing with a variety of aircraft platforms. In these programs, speech recognizers have been operated successfully in fighter aircraft, with applications including setting radio frequencies, commanding an autopilot system, setting steer-point coordinates and weapons release parameters, and controlling flight display.

Working with Swedish pilots flying in the JAS-39 Gripen cockpit, Englund (2004) found recognition deteriorated with increasing g-loads. The report also concluded that adaptation greatly improved the results in all cases and that the introduction of models for breathing was shown to improve recognition scores significantly. Contrary to what might have been expected, no effects of the broken English of the speakers were found. It was evident that spontaneous speech caused problems for the recognizer, as might have been expected. A restricted vocabulary, and above all, a proper syntax, could thus be expected to improve recognition accuracy substantially.[120]

The Eurofighter Typhoon, currently in service with the UK RAF, employs a speaker-dependent system, requiring each pilot to create a template. The system is not used for any safety-critical or weapon-critical tasks, such as weapon release or lowering of the undercarriage, but is used for a wide range of other cockpit functions. Voice commands are confirmed by visual and/or aural feedback. The system is seen as a major design feature in the reduction of pilot workload,[121] and even allows the pilot to assign targets to his aircraft with two simple voice commands or to any of his wingmen with only five commands.[122]

Speaker-independent systems are also being developed and are under test for the F-35 Lightning II (JSF) and the Alenia Aermacchi M-346 Master lead-in fighter trainer.
These systems have produced word accuracy scores in excess of 98%.[123]

Helicopters

The problems of achieving high recognition accuracy under stress and noise are particularly relevant in the helicopter environment as well as in the jet fighter environment. The acoustic noise problem is actually more severe in the helicopter environment, not only because of the high noise levels but also because the helicopter pilot, in general, does not wear a facemask, which would reduce acoustic noise in the microphone. Substantial test and evaluation programs have been carried out in the past decade in speech recognition systems applications in helicopters, notably by the U.S. Army Avionics Research and Development Activity (AVRADA) and by the Royal Aerospace Establishment (RAE) in the UK. Work in France has included speech recognition in the Puma helicopter. There has also been much useful work in Canada. Results have been encouraging, and voice applications have included: control of communication radios, setting of navigation systems, and control of an automated target handover system.

As in fighter applications, the overriding issue for voice in helicopters is the impact on pilot effectiveness. Encouraging results are reported for the AVRADA tests, although these represent only a feasibility demonstration in a test environment. Much remains to be done both in speech recognition and in overall speech technology in order to consistently achieve performance improvements in operational settings.

Training air traffic controllers

Training for air traffic controllers (ATC) represents an excellent application for speech recognition systems. Many ATC training systems currently require a person to act as a "pseudo-pilot", engaging in a voice dialog with the trainee controller, which simulates the dialog that the controller would have to conduct with pilots in a real ATC situation.
Speech recognition and synthesis techniques offer the potential to eliminate the need for a person to act as a pseudo-pilot, thus reducing training and support personnel. In theory, air controller tasks are also characterized by highly structured speech as the primary output of the controller, hence reducing the difficulty of the speech recognition task should be possible. In practice, this is rarely the case. The FAA document 7110.65 details the phrases that should be used by air traffic controllers. While this document gives fewer than 150 examples of such phrases, the number of phrases supported by one simulation vendor's speech recognition system is in excess of 500,000.

The USAF, USMC, US Army, US Navy, and FAA, as well as a number of international ATC training organizations such as the Royal Australian Air Force and Civil Aviation Authorities in Italy, Brazil, and Canada, are currently using ATC simulators with speech recognition from a number of different vendors.[citation needed]

Telephony and other domains

ASR is now commonplace in the field of telephony and is becoming more widespread in the field of computer gaming and simulation. In telephony systems, ASR is now being predominantly used in contact centers by integrating it with IVR systems. Despite the high level of integration with word processing in general personal computing, in the field of document production, ASR has not seen the expected increases in use.

The improvement of mobile processor speeds has made speech recognition practical in smartphones. Speech is used mostly as a part of a user interface, for creating predefined or custom speech commands.

People with disabilities

People with disabilities can benefit from speech recognition programs.
For individuals who are Deaf or Hard of Hearing, speech recognition software is used to automatically generate closed captioning of conversations such as discussions in conference rooms, classroom lectures, and/or religious services.[124]

Students who are blind (see Blindness and education) or have very low vision can benefit from using the technology to convey words and then hear the computer recite them, as well as use a computer by commanding with their voice, instead of having to look at the screen and keyboard.[125]

Students who are physically disabled or have a repetitive strain injury or other injuries to the upper extremities can be relieved from having to worry about handwriting, typing, or working with a scribe on school assignments by using speech-to-text programs. They can also utilize speech recognition technology to enjoy searching the Internet or using a computer at home without having to physically operate a mouse and keyboard.[125]

Speech recognition can allow students with learning disabilities to become better writers. By saying the words aloud, they can increase the fluidity of their writing, and be alleviated of concerns regarding spelling, punctuation, and other mechanics of writing.[126] Also, see Learning disability.

The use of voice recognition software, in conjunction with a digital audio recorder and a personal computer running word-processing software, has proven to be positive for restoring damaged short-term memory capacity in stroke and craniotomy individuals.

Speech recognition is also very useful for people who have difficulty using their hands, ranging from mild repetitive stress injuries to involved disabilities that preclude using conventional computer input devices.
In fact, people who used the keyboard a lot and developed RSI became an urgent early market for speech recognition.[127][128] Speech recognition is used in deaf telephony, such as voicemail to text, relay services, and captioned telephone. Individuals with learning disabilities who have problems with thought-to-paper communication (essentially they think of an idea but it is processed incorrectly, causing it to end up differently on paper) can possibly benefit from the software, but the technology is not bug proof.[129] Also, the whole idea of speech to text can be hard for intellectually disabled persons, due to the fact that it is rare that anyone tries to learn the technology to teach the person with the disability.[130]

This type of technology can help those with dyslexia, but other disabilities are still in question. The effectiveness of the product is the problem hindering it from being effective. Although a child may be able to say a word, depending on how clearly they say it, the technology may think they are saying another word and input the wrong one, giving them more work to fix and causing them to take more time with fixing the wrong word.[131]

Further applications
title="Sensory, Inc.">Sensory, Inc.</a> in the Mars Microphone on the Lander<sup id="cite_ref-Planetary_Society_article_132-0" class="reference"><a href="#cite_note-Planetary_Society_article-132"><span class="cite-bracket">&#91;</span>132<span class="cite-bracket">&#93;</span></a></sup></li> <li>Automatic <a href="/wiki/Same_language_subtitling" title="Same language subtitling">subtitling</a> with speech recognition</li> <li>Automatic <a href="/wiki/Emotion_recognition" title="Emotion recognition">emotion recognition</a><sup id="cite_ref-133" class="reference"><a href="#cite_note-133"><span class="cite-bracket">&#91;</span>133<span class="cite-bracket">&#93;</span></a></sup></li> <li>Automatic <a href="/wiki/Shot_(filmmaking)" title="Shot (filmmaking)">shot</a> listing in audiovisual production</li> <li><a href="/wiki/Automatic_translation" class="mw-redirect" title="Automatic translation">Automatic translation</a></li> <li><a href="/wiki/EDiscovery" class="mw-redirect" title="EDiscovery">eDiscovery</a> (Legal discovery)</li> <li><a href="/wiki/Hands-free_computing" title="Hands-free computing">Hands-free computing</a>: Speech recognition computer <a href="/wiki/User_interface" title="User interface">user interface</a></li> <li><a href="/wiki/Home_automation" title="Home automation">Home automation</a></li> <li><a href="/wiki/Interactive_voice_response" title="Interactive voice response">Interactive voice response</a></li> <li><a href="/wiki/Mobile_telephony" title="Mobile telephony">Mobile telephony</a>, including mobile email</li> <li><a href="/wiki/Multimodal_interaction" title="Multimodal interaction">Multimodal interaction</a><sup id="cite_ref-interspeech2014Keynote_64-1" class="reference"><a href="#cite_note-interspeech2014Keynote-64"><span class="cite-bracket">&#91;</span>64<span class="cite-bracket">&#93;</span></a></sup></li> <li>Real Time <a href="/wiki/Captioning" class="mw-redirect" title="Captioning">Captioning</a><sup id="cite_ref-134" class="reference"><a href="#cite_note-134"><span class="cite-bracket">&#91;</span>134<span class="cite-bracket">&#93;</span></a></sup></li> <li><a href="/wiki/Robotics" title="Robotics">Robotics</a></li> <li>Security, including usage with other biometric scanners for <a href="/wiki/Multi-factor_authentication" title="Multi-factor authentication">multi-factor authentication</a><sup id="cite_ref-135" class="reference"><a href="#cite_note-135"><span class="cite-bracket">&#91;</span>135<span class="cite-bracket">&#93;</span></a></sup></li> <li>Speech to text (transcription of speech into text, real time video <a href="/wiki/Captioning" class="mw-redirect" title="Captioning">captioning</a>, Court reporting )</li> <li><a href="/wiki/Telematics" title="Telematics">Telematics</a> (e.g. vehicle Navigation Systems)</li> <li><a href="/wiki/Transcription_(linguistics)" title="Transcription (linguistics)">Transcription</a> (digital speech-to-text)</li> <li><a href="/wiki/Video_games" class="mw-redirect" title="Video games">Video games</a>, with <i><a href="/wiki/Tom_Clancy%27s_EndWar" title="Tom Clancy&#39;s EndWar">Tom Clancy's EndWar</a></i> and <i><a href="/wiki/Lifeline_(video_game)" title="Lifeline (video game)">Lifeline</a></i> as working examples</li> <li><a href="/wiki/Virtual_assistant_(artificial_intelligence)" class="mw-redirect" title="Virtual assistant (artificial intelligence)">Virtual assistant</a> (e.g. 
<a href="/wiki/Apple_Siri" class="mw-redirect" title="Apple Siri">Apple's Siri</a>)</li></ul> <div class="mw-heading mw-heading2"><h2 id="Performance">Performance</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Speech_recognition&amp;action=edit&amp;section=26" title="Edit section: Performance"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>The performance of speech recognition systems is usually evaluated in terms of accuracy and speed.<sup id="cite_ref-136" class="reference"><a href="#cite_note-136"><span class="cite-bracket">&#91;</span>136<span class="cite-bracket">&#93;</span></a></sup><sup id="cite_ref-137" class="reference"><a href="#cite_note-137"><span class="cite-bracket">&#91;</span>137<span class="cite-bracket">&#93;</span></a></sup> Accuracy is usually rated with <a href="/wiki/Word_error_rate" title="Word error rate">word error rate</a> (WER), whereas speed is measured with the <a href="/w/index.php?title=Real_time_factor&amp;action=edit&amp;redlink=1" class="new" title="Real time factor (page does not exist)">real time factor</a>. Other measures of accuracy include <a href="/wiki/Single_Word_Error_Rate" class="mw-redirect" title="Single Word Error Rate">Single Word Error Rate</a> (SWER) and <a href="/w/index.php?title=Command_Success_Rate&amp;action=edit&amp;redlink=1" class="new" title="Command Success Rate (page does not exist)">Command Success Rate</a> (CSR). </p><p>Speech recognition by machine is a very complex problem, however. Vocalizations vary in terms of accent, pronunciation, articulation, roughness, nasality, pitch, volume, and speed. Speech is distorted by a background noise and echoes, electrical characteristics. Accuracy of speech recognition may vary with the following:<sup id="cite_ref-138" class="reference"><a href="#cite_note-138"><span class="cite-bracket">&#91;</span>138<span class="cite-bracket">&#93;</span></a></sup><sup class="noprint Inline-Template Template-Fact" style="white-space:nowrap;">&#91;<i><a href="/wiki/Wikipedia:Citation_needed" title="Wikipedia:Citation needed"><span title="This claim needs references to reliable sources. (May 2013)">citation needed</span></a></i>&#93;</sup> </p> <ul><li>Vocabulary size and confusability</li> <li>Speaker dependence versus independence</li> <li>Isolated, discontinuous or continuous speech</li> <li>Task and language constraints</li> <li>Read versus spontaneous speech</li> <li>Adverse conditions</li></ul> <div class="mw-heading mw-heading3"><h3 id="Accuracy">Accuracy</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Speech_recognition&amp;action=edit&amp;section=27" title="Edit section: Accuracy"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>As mentioned earlier in this article, the accuracy of speech recognition may vary depending on the following factors: </p> <ul><li>Error rates increase as the vocabulary size grows:</li></ul> <dl><dd><dl><dd>e.g. the 10 digits "zero" to "nine" can be recognized essentially perfectly, but vocabulary sizes of 200, 5000 or 100000 may have error rates of 3%, 7%, or 45% respectively.</dd></dl></dd></dl> <ul><li>Vocabulary is hard to recognize if it contains confusing letters:</li></ul> <dl><dd><dl><dd>e.g. 
- Speaker dependence vs. independence: a speaker-dependent system is intended for use by a single speaker; a speaker-independent system is intended for use by any speaker (more difficult).
- Isolated, discontinuous or continuous speech: with isolated speech, single words are used, so the speech becomes easier to recognize. With discontinuous speech, full sentences separated by silence are used, so the speech likewise becomes easier to recognize, much as with isolated speech. With continuous speech, naturally spoken sentences are used, so the speech becomes harder to recognize, in contrast to both isolated and discontinuous speech.
- Task and language constraints: e.g. a querying application may dismiss the hypothesis "The apple is red"; constraints may be semantic, rejecting "The apple is angry", or syntactic, rejecting "Red is apple the". Constraints are often represented by a grammar.
- Read vs. spontaneous speech: when a person reads, it is usually in a context that has been previously prepared, but when a person uses spontaneous speech, it is difficult to recognize the speech because of disfluencies (like "uh" and "um", false starts, incomplete sentences, stuttering, coughing, and laughter) and limited vocabulary.
- Adverse conditions: environmental noise (e.g. noise in a car or a factory) and acoustical distortions (e.g. echoes, room acoustics).

Speech recognition is a multi-leveled pattern recognition task.

- Acoustical signals are structured into a hierarchy of units, e.g. phonemes, words, phrases, and sentences;
- Each level provides additional constraints, e.g. known word pronunciations or legal word sequences, which can compensate for errors or uncertainties at a lower level;
- This hierarchy of constraints is exploited by combining decisions probabilistically at all lower levels and making more deterministic decisions only at the highest level. Speech recognition by a machine is thus a process broken into several phases. Computationally, it is a problem in which a sound pattern has to be recognized or classified into a category that represents a meaning to a human. Every acoustic signal can be broken into smaller, more basic sub-signals. As the more complex sound signal is broken into smaller sub-sounds, different levels are created: at the top level are complex sounds, which are made of simpler sounds on the lower level, and going to still lower levels we reach ever more basic, shorter and simpler sounds. At the lowest level, where the sounds are the most fundamental, a machine would check for simple and more probabilistic rules of what a sound should represent. Once these sounds are put together into more complex sounds at an upper level, a new set of more deterministic rules should predict what the new complex sound should represent. The uppermost level of deterministic rules should figure out the meaning of complex expressions.

In order to expand our knowledge about speech recognition, we need to take into consideration neural networks. There are four steps in neural network approaches:

- Digitize the speech that we want to recognize. For telephone speech the sampling rate is 8,000 samples per second.
- Compute features of the spectral domain of the speech (with the Fourier transform), computed every 10 ms, with one 10 ms section called a frame (a sketch of this step follows the list).
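A minimal numpy sketch of that digitize-and-featurize step, using the 8 kHz telephone rate and an assumed conventional 25 ms analysis window with a 10 ms hop (the signal here is synthetic):

```python
import numpy as np

SAMPLE_RATE = 8000                     # telephone speech: 8,000 samples per second
FRAME_LEN = int(0.025 * SAMPLE_RATE)   # 25 ms analysis window
HOP = int(0.010 * SAMPLE_RATE)         # a new frame every 10 ms

# Synthetic one-second "speech" signal: a 440 Hz tone plus noise.
t = np.arange(SAMPLE_RATE) / SAMPLE_RATE
signal = np.sin(2 * np.pi * 440 * t) + 0.1 * np.random.randn(SAMPLE_RATE)

def spectral_frames(signal):
    """Slice the signal into overlapping frames and take the magnitude
    spectrum of each frame with the Fourier transform."""
    frames = []
    for start in range(0, len(signal) - FRAME_LEN + 1, HOP):
        frame = signal[start:start + FRAME_LEN] * np.hamming(FRAME_LEN)
        frames.append(np.abs(np.fft.rfft(frame)))
    return np.array(frames)

features = spectral_frames(signal)
print(features.shape)  # -> (98, 101): roughly one feature vector per 10 ms
```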
Speech recognition is a multi-level pattern recognition task:

- Acoustic signals are structured into a hierarchy of units, e.g. phonemes, words, phrases, and sentences;
- Each level provides additional constraints, e.g. known word pronunciations or legal word sequences, which can compensate for errors or uncertainties at lower levels;
- This hierarchy of constraints is exploited by combining decisions probabilistically at all lower levels and making more deterministic decisions only at the highest level.

Speech recognition by machine is a process broken into several phases. Computationally, it is a problem in which a sound pattern has to be recognized or classified into a category that represents a meaning to a human. Every acoustic signal can be broken into smaller, more basic sub-signals. As the complex sound signal is broken into smaller sub-sounds, a hierarchy of levels is created: at the top level are complex sounds, made of simpler sounds at the level below; descending the hierarchy, the sounds become shorter, simpler, and more basic. At the lowest level, where the sounds are most fundamental, a machine checks simple, probabilistic rules for what each sound should represent. Once these sounds are put together into more complex sounds at the level above, a new set of more deterministic rules predicts what each combined sound should represent, and at the top level a deterministic rule works out the meaning of complete expressions.

Neural network approaches to recognition involve four steps, the first two being:

- Digitize the speech to be recognized; for telephone speech the sampling rate is 8000 samples per second.
- Compute spectral-domain features of the speech (with the Fourier transform); these are computed every 10 ms, with each 10 ms section called a frame (see the sketch after this list).

Sound is produced by the vibration of air (or some other medium), which we register with our ears and machines register with receivers. A basic sound creates a wave with two descriptors: amplitude (how strong it is) and frequency (how often it vibrates per second).
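As a concrete illustration of these two steps, the following NumPy sketch synthesizes a stand-in signal at the 8 kHz telephone rate, slices it into 10 ms frames, and computes a spectral magnitude feature vector per frame with the Fourier transform. The 440 Hz test tone and the Hamming window are illustrative assumptions, not from the article:

```python
import numpy as np

# Parameters from the text: telephone speech sampled at 8000 samples/s,
# with spectral features computed every 10 ms ("frames").
SAMPLE_RATE = 8000                    # samples per second
FRAME_LEN = SAMPLE_RATE // 100        # 10 ms -> 80 samples per frame

# Stand-in for one second of digitized speech (here: a 440 Hz tone).
t = np.arange(SAMPLE_RATE) / SAMPLE_RATE
signal = np.sin(2 * np.pi * 440 * t)

# Slice into non-overlapping 10 ms frames and take a windowed magnitude
# spectrum of each frame via the (real) Fourier transform.
n_frames = len(signal) // FRAME_LEN
frames = signal[: n_frames * FRAME_LEN].reshape(n_frames, FRAME_LEN)
features = np.abs(np.fft.rfft(frames * np.hamming(FRAME_LEN), axis=1))

print(features.shape)   # (100, 41): 100 frames x 41 spectral magnitudes
```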
Accuracy can be computed with the help of the word error rate (WER), which is calculated by aligning the recognized word sequence against the reference word sequence using dynamic string alignment. A complication in computing the word error rate is that the recognized and reference sequences can differ in length.

The formula for the word error rate is

$$WER = \frac{s + d + i}{n}$$

where s is the number of substitutions, d the number of deletions, i the number of insertions, and n the number of reference words.

The related word recognition rate (WRR) is

$$WRR = 1 - WER = \frac{n - s - d - i}{n} = \frac{h - i}{n}$$

where h is the number of correctly recognized words:

$$h = n - (s + d).$$
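The following is a minimal sketch of this computation, using the standard dynamic-programming edit distance over words as the alignment; the reference and hypothesis sentences are invented examples:

```python
def wer(reference, hypothesis):
    """Word error rate via dynamic-programming alignment of word lists."""
    r, h = reference.split(), hypothesis.split()
    # d[i][j] = minimum substitutions + deletions + insertions needed to
    # turn the first i reference words into the first j hypothesis words.
    d = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
    for i in range(len(r) + 1):
        d[i][0] = i                              # i deletions
    for j in range(len(h) + 1):
        d[0][j] = j                              # j insertions
    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            sub = d[i - 1][j - 1] + (r[i - 1] != h[j - 1])
            d[i][j] = min(sub, d[i - 1][j] + 1, d[i][j - 1] + 1)
    return d[len(r)][len(h)] / len(r)            # (s + d + i) / n

ref = "the cat sat on the mat"
hyp = "the cat sat on mat today"                 # one deletion, one insertion
e = wer(ref, hyp)
print(f"WER = {e:.2f}, WRR = {1 - e:.2f}")       # WER = 0.33, WRR = 0.67
```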
Security concerns

Speech recognition can become a means of attack, theft, or accidental operation. For example, activation words like "Alexa" spoken in an audio or video broadcast can cause devices in homes and offices to start listening for input inappropriately, or possibly take an unwanted action.[140] Voice-controlled devices are also accessible to visitors to the building, or even those outside the building if they can be heard inside. Attackers may be able to gain access to personal information, like calendar and address-book contents, private messages, and documents. They may also be able to impersonate the user to send messages or make online purchases.

Two attacks that use artificial sounds have been demonstrated. One transmits ultrasound in an attempt to send commands without nearby people noticing.[141] The other adds small, inaudible distortions to other speech or music that are specially crafted to confuse the specific speech recognition system into recognizing music as speech, or to make what sounds like one command to a human sound like a different command to the system.[142]

Further information

Conferences and journals

Popular speech recognition conferences held every year or two include SpeechTEK and SpeechTEK Europe, ICASSP, Interspeech/Eurospeech, and the IEEE ASRU. Conferences in the field of natural language processing, such as ACL, NAACL, EMNLP, and HLT, are beginning to include papers on speech processing. Important journals include the IEEE Transactions on Speech and Audio Processing (later renamed IEEE Transactions on Audio, Speech and Language Processing and, since September 2014, IEEE/ACM Transactions on Audio, Speech and Language Processing after a merger with an ACM publication), Computer Speech and Language, and Speech Communication.
</p> <div class="mw-heading mw-heading3"><h3 id="Books">Books</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Speech_recognition&amp;action=edit&amp;section=31" title="Edit section: Books"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Books like "Fundamentals of Speech Recognition" by <a href="/wiki/Lawrence_Rabiner" title="Lawrence Rabiner">Lawrence Rabiner</a> can be useful to acquire basic knowledge but may not be fully up to date (1993). Another good source can be "Statistical Methods for Speech Recognition" by <a href="/wiki/Frederick_Jelinek" title="Frederick Jelinek">Frederick Jelinek</a> and "Spoken Language Processing (2001)" by <a href="/wiki/Xuedong_Huang" title="Xuedong Huang">Xuedong Huang</a> etc., "Computer Speech", by <a href="/wiki/Manfred_R._Schroeder" title="Manfred R. Schroeder">Manfred R. Schroeder</a>, second edition published in 2004, and "Speech Processing: A Dynamic and Optimization-Oriented Approach" published in 2003 by Li Deng and Doug O'Shaughnessey. The updated textbook <i>Speech and Language Processing</i> (2008) by <a href="/wiki/Daniel_Jurafsky" class="mw-redirect" title="Daniel Jurafsky">Jurafsky</a> and Martin presents the basics and the state of the art for ASR. <a href="/wiki/Speaker_recognition" title="Speaker recognition">Speaker recognition</a> also uses the same features, most of the same front-end processing, and classification techniques as is done in speech recognition. A comprehensive textbook, "Fundamentals of Speaker Recognition" is an in depth source for up to date details on the theory and practice.<sup id="cite_ref-auto_143-0" class="reference"><a href="#cite_note-auto-143"><span class="cite-bracket">&#91;</span>143<span class="cite-bracket">&#93;</span></a></sup> A good insight into the techniques used in the best modern systems can be gained by paying attention to government sponsored evaluations such as those organised by <a href="/wiki/DARPA" title="DARPA">DARPA</a> (the largest speech recognition-related project ongoing as of 2007 is the GALE project, which involves both speech recognition and translation components). </p><p>A good and accessible introduction to speech recognition technology and its history is provided by the general audience book "The Voice in the Machine. Building Computers That Understand Speech" by <a href="/wiki/Roberto_Pieraccini" title="Roberto Pieraccini">Roberto Pieraccini</a> (2012). </p><p>The most recent book on speech recognition is <i>Automatic Speech Recognition: A Deep Learning Approach</i> (Publisher: Springer) written by Microsoft researchers D. Yu and L. Deng and published near the end of 2014, with highly mathematically oriented technical detail on how deep learning methods are derived and implemented in modern speech recognition systems based on DNNs and related deep learning methods.<sup id="cite_ref-ReferenceA_84-1" class="reference"><a href="#cite_note-ReferenceA-84"><span class="cite-bracket">&#91;</span>84<span class="cite-bracket">&#93;</span></a></sup> A related book, published earlier in 2014, "Deep Learning: Methods and Applications" by L. Deng and D. 
Yu, provides a less technical but more methodology-focused overview of DNN-based speech recognition during 2009–2014, placed within the more general context of deep learning applications including not only speech recognition but also image recognition, natural language processing, information retrieval, multimodal processing, and multitask learning.[80]

Software

In terms of freely available resources, Carnegie Mellon University's Sphinx toolkit is one place to start, both to learn about speech recognition and to start experimenting. Another resource (free but copyrighted) is the HTK book (and the accompanying HTK toolkit). For more recent and state-of-the-art techniques, the Kaldi toolkit can be used.[144] In 2017, Mozilla launched the open-source project Common Voice[145] to gather a large database of voices to help build the free speech recognition project DeepSpeech (available free on GitHub),[146] using Google's open-source platform TensorFlow.[147] When Mozilla redirected funding away from the project in 2020, it was forked by its original developers as Coqui STT,[148] using the same open-source license.[149][150]

Google Gboard supports speech recognition in all Android applications.
It can be activated through the microphone icon.[151]

Commercial cloud-based speech recognition APIs are broadly available.
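As one concrete example of calling such a hosted API, the following sketch uses the third-party Python package SpeechRecognition; it assumes the package is installed and that a WAV file exists at the placeholder path "utterance.wav":

```python
import speech_recognition as sr   # third-party package: SpeechRecognition

recognizer = sr.Recognizer()
with sr.AudioFile("utterance.wav") as source:   # placeholder file name
    audio = recognizer.record(source)           # read the entire file

try:
    # Sends the audio to Google's free web speech API for transcription.
    print(recognizer.recognize_google(audio))
except sr.UnknownValueError:
    print("speech was unintelligible")
except sr.RequestError as err:
    print(f"API request failed: {err}")
```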
recognition">Speaker recognition</a></li> <li><a href="/wiki/Speech_analytics" title="Speech analytics">Speech analytics</a></li> <li><a href="/wiki/Speech_interface_guideline" title="Speech interface guideline">Speech interface guideline</a></li> <li><a href="/wiki/Speech_recognition_software_for_Linux" title="Speech recognition software for Linux">Speech recognition software for Linux</a></li> <li><a href="/wiki/Speech_synthesis" title="Speech synthesis">Speech synthesis</a></li> <li><a href="/wiki/Speech_verification" class="mw-redirect" title="Speech verification">Speech verification</a></li> <li><a href="/wiki/Subtitle_(captioning)" class="mw-redirect" title="Subtitle (captioning)">Subtitle (captioning)</a></li> <li><a href="/wiki/VoiceXML" title="VoiceXML">VoiceXML</a></li> <li><a href="/wiki/VoxForge" title="VoxForge">VoxForge</a></li> <li><a href="/wiki/Windows_Speech_Recognition" title="Windows Speech Recognition">Windows Speech Recognition</a></li></ul> <dl><dt>Lists</dt></dl> <ul><li><a href="/wiki/List_of_speech_recognition_software" title="List of speech recognition software">List of speech recognition software</a></li> <li><a href="/wiki/List_of_emerging_technologies" title="List of emerging technologies">List of emerging technologies</a></li> <li><a href="/wiki/Outline_of_artificial_intelligence" title="Outline of artificial intelligence">Outline of artificial intelligence</a></li> <li><a href="/wiki/Timeline_of_speech_and_voice_recognition" title="Timeline of speech and voice recognition">Timeline of speech and voice recognition</a></li></ul> </div> <div class="mw-heading mw-heading2"><h2 id="References">References</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Speech_recognition&amp;action=edit&amp;section=34" title="Edit section: References"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <style data-mw-deduplicate="TemplateStyles:r1239543626">.mw-parser-output .reflist{margin-bottom:0.5em;list-style-type:decimal}@media screen{.mw-parser-output .reflist{font-size:90%}}.mw-parser-output .reflist .references{font-size:100%;margin-bottom:0;list-style-type:inherit}.mw-parser-output .reflist-columns-2{column-width:30em}.mw-parser-output .reflist-columns-3{column-width:25em}.mw-parser-output .reflist-columns{margin-top:0.3em}.mw-parser-output .reflist-columns ol{margin-top:0}.mw-parser-output .reflist-columns li{page-break-inside:avoid;break-inside:avoid-column}.mw-parser-output .reflist-upper-alpha{list-style-type:upper-alpha}.mw-parser-output .reflist-upper-roman{list-style-type:upper-roman}.mw-parser-output .reflist-lower-alpha{list-style-type:lower-alpha}.mw-parser-output .reflist-lower-greek{list-style-type:lower-greek}.mw-parser-output .reflist-lower-roman{list-style-type:lower-roman}</style><div class="reflist"> <div class="mw-references-wrap mw-references-columns"><ol class="references"> <li id="cite_note-1"><span class="mw-cite-backlink"><b><a href="#cite_ref-1">^</a></b></span> <span class="reference-text"><style data-mw-deduplicate="TemplateStyles:r1238218222">.mw-parser-output cite.citation{font-style:inherit;word-wrap:break-word}.mw-parser-output .citation q{quotes:"\"""\"""'""'"}.mw-parser-output .citation:target{background-color:rgba(0,127,255,0.133)}.mw-parser-output .id-lock-free.id-lock-free a{background:url("//upload.wikimedia.org/wikipedia/commons/6/65/Lock-green.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-limited.id-lock-limited 
1. "Speaker Independent Connected Speech Recognition – Fifth Generation Computer Corporation". Fifthgen.com. Archived from the original on 11 November 2013. Retrieved 15 June 2013.
2. P. Nguyen (2010). "Automatic classification of speaker characteristics". International Conference on Communications and Electronics 2010. pp. 147–152. doi:10.1109/ICCE.2010.5670700. ISBN 978-1-4244-7055-6. S2CID 13482115.
3. "British English definition of voice recognition". Macmillan Publishers Limited. Archived from the original on 16 September 2011. Retrieved 21 February 2012.
4. "voice recognition, definition of". WebFinance, Inc. Archived from the original on 3 December 2011. Retrieved 21 February 2012.
5. "The Mailbag LG #114". Linuxgazette.net. Archived from the original on 19 February 2013. Retrieved 15 June 2013.
6. Sarangi, Susanta; Sahidullah, Md; Saha, Goutam (September 2020). "Optimization of data-driven filterbank for automatic speaker verification". Digital Signal Processing. 104: 102795. arXiv:2007.10729. Bibcode:2020DSP...10402795S. doi:10.1016/j.dsp.2020.102795. S2CID 220665533.
7. Reynolds, Douglas; Rose, Richard (January 1995). "Robust text-independent speaker identification using Gaussian mixture speaker models" (PDF). IEEE Transactions on Speech and Audio Processing. 3 (1): 72–83. doi:10.1109/89.365379. ISSN 1063-6676. OCLC 26108901. S2CID 7319345. Archived (PDF) from the original on 8 March 2014. Retrieved 21 February 2014.
8. "Speaker Identification (WhisperID)". Microsoft Research. Microsoft. Archived from the original on 25 February 2014. Retrieved 21 February 2014.
9. "Obituaries: Stephen Balashek". The Star-Ledger. 22 July 2012. Archived from the original on 4 April 2019. Retrieved 9 September 2024.
10. "IBM-Shoebox-front.jpg". androidauthority.net. Archived from the original on 9 August 2018. Retrieved 4 April 2019.
11. Juang, B. H.; Rabiner, Lawrence R. "Automatic speech recognition – a brief history of the technology development" (PDF). p. 6. Archived (PDF) from the original on 17 August 2014. Retrieved 17 January 2015.
12. Melanie Pinola (2 November 2011). "Speech Recognition Through the Decades: How We Ended Up With Siri". PC World. Archived from the original on 3 November 2018. Retrieved 22 October 2018.
13. Gray, Robert M. (2010). "A History of Realtime Digital Speech on Packet Networks: Part II of Linear Predictive Coding and the Internet Protocol" (PDF). Found. Trends Signal Process. 3 (4): 203–303. doi:10.1561/2000000036. ISSN 1932-8346. Archived (PDF) from the original on 9 October 2022. Retrieved 9 September 2024.
14. John R. Pierce (1969). "Whither speech recognition?". Journal of the Acoustical Society of America. 46 (48): 1049–1051. Bibcode:1969ASAJ...46.1049P. doi:10.1121/1.1911801.
15. Benesty, Jacob; Sondhi, M. M.; Huang, Yiteng (2008). Springer Handbook of Speech Processing. Springer Science & Business Media. ISBN 978-3540491255.
16. John Makhoul. "ISCA Medalist: For leadership and extensive contributions to speech and language processing". Archived from the original on 24 January 2018. Retrieved 23 January 2018.
17. Blechman, R. O.; Blechman, Nicholas (23 June 2008). "Hello, Hal". The New Yorker. Archived from the original on 20 January 2015. Retrieved 17 January 2015.
18. Klatt, Dennis H. (1977). "Review of the ARPA speech understanding project". The Journal of the Acoustical Society of America. 62 (6): 1345–1366. Bibcode:1977ASAJ...62.1345K. doi:10.1121/1.381666.
19. Rabiner (1984). "The Acoustics, Speech, and Signal Processing Society. A Historical Perspective" (PDF). Archived (PDF) from the original on 9 August 2017. Retrieved 23 January 2018.
20. "First-Hand: The Hidden Markov Model – Engineering and Technology History Wiki". ethw.org. 12 January 2015. Archived from the original on 3 April 2018. Retrieved 1 May 2018.
21. "James Baker interview". Archived from the original on 28 August 2017. Retrieved 9 February 2017.
22. "Pioneering Speech Recognition". 7 March 2012. Archived from the original on 19 February 2015. Retrieved 18 January 2015.
23. Huang, Xuedong; Baker, James; Reddy, Raj (January 2014). "A historical perspective of speech recognition". Communications of the ACM. 57 (1): 94–103. doi:10.1145/2500887. ISSN 0001-0782. S2CID 6175701. Archived from the original on 8 December 2023.
24. Juang, B. H.; Rabiner, Lawrence R. Automatic speech recognition – a brief history of the technology development (PDF) (Report). p. 10. Archived (PDF) from the original on 17 August 2014. Retrieved 17 January 2015.
25. Li, Xiaochang (1 July 2023). ""There's No Data Like More Data": Automatic Speech Recognition and the Making of Algorithmic Culture". Osiris. 38: 165–182. doi:10.1086/725132. ISSN 0369-7827. S2CID 259502346.
26. "History of Speech Recognition". Dragon Medical Transcription. Archived from the original on 13 August 2015. Retrieved 17 January 2015.
27. Billi, Roberto; Canavesio, Franco; Ciaramella, Alberto; Nebbia, Luciano (1 November 1995). "Interactive voice technology at work: The CSELT experience". Speech Communication. 17 (3): 263–271.
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1016%2F0167-6393%2895%2900030-R">10.1016/0167-6393(95)00030-R</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Speech+Communication&amp;rft.atitle=Interactive+voice+technology+at+work%3A+The+CSELT+experience&amp;rft.volume=17&amp;rft.issue=3&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E263-%3C%2Fspan%3E271&amp;rft.date=1995-11-01&amp;rft_id=info%3Adoi%2F10.1016%2F0167-6393%2895%2900030-R&amp;rft.aulast=Billi&amp;rft.aufirst=Roberto&amp;rft.au=Canavesio%2C+Franco&amp;rft.au=Ciaramella%2C+Alberto&amp;rft.au=Nebbia%2C+Luciano&amp;rft_id=https%3A%2F%2Fwww.sciencedirect.com%2Fscience%2Farticle%2Fabs%2Fpii%2F016763939500030R&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-Communications_of_the_ACM-28"><span class="mw-cite-backlink">^ <a href="#cite_ref-Communications_of_the_ACM_28-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Communications_of_the_ACM_28-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFXuedong_HuangJames_BakerRaj_Reddy2014" class="citation web cs1">Xuedong Huang; James Baker; Raj Reddy (January 2014). <a rel="nofollow" class="external text" href="http://cacm.acm.org/magazines/2014/1/170863-a-historical-perspective-of-speech-recognition/fulltext#R5">"A Historical Perspective of Speech Recognition"</a>. Communications of the ACM. <a rel="nofollow" class="external text" href="http://archive.wikiwix.com/cache/20150120074239/http://cacm.acm.org/magazines/2014/1/170863-a-historical-perspective-of-speech-recognition/fulltext#R5">Archived</a> from the original on 20 January 2015<span class="reference-accessdate">. Retrieved <span class="nowrap">20 January</span> 2015</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=A+Historical+Perspective+of+Speech+Recognition&amp;rft.pub=Communications+of+the+ACM&amp;rft.date=2014-01&amp;rft.au=Xuedong+Huang&amp;rft.au=James+Baker&amp;rft.au=Raj+Reddy&amp;rft_id=http%3A%2F%2Fcacm.acm.org%2Fmagazines%2F2014%2F1%2F170863-a-historical-perspective-of-speech-recognition%2Ffulltext%23R5&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-29"><span class="mw-cite-backlink"><b><a href="#cite_ref-29">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFKevin_McKean1980" class="citation news cs1">Kevin McKean (8 April 1980). <a rel="nofollow" class="external text" href="https://news.google.com/newspapers?nid=1798&amp;dat=19800408&amp;id=xgsdAAAAIBAJ&amp;pg=6057,1141823">"When Cole talks, computers listen"</a>. Sarasota Journal. AP<span class="reference-accessdate">. 
Retrieved <span class="nowrap">23 November</span> 2015</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.atitle=When+Cole+talks%2C+computers+listen&amp;rft.date=1980-04-08&amp;rft.au=Kevin+McKean&amp;rft_id=https%3A%2F%2Fnews.google.com%2Fnewspapers%3Fnid%3D1798%26dat%3D19800408%26id%3DxgsdAAAAIBAJ%26pg%3D6057%2C1141823&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-:2-30"><span class="mw-cite-backlink"><b><a href="#cite_ref-:2_30-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="http://actapricot.org/history/apricot_review_1.html">"ACT/Apricot - Apricot history"</a>. <i>actapricot.org</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20161221091131/http://actapricot.org/history/apricot_review_1.html">Archived</a> from the original on 21 December 2016<span class="reference-accessdate">. Retrieved <span class="nowrap">2 February</span> 2016</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=unknown&amp;rft.jtitle=actapricot.org&amp;rft.atitle=ACT%2FApricot+-+Apricot+history&amp;rft_id=http%3A%2F%2Factapricot.org%2Fhistory%2Fapricot_review_1.html&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-31"><span class="mw-cite-backlink"><b><a href="#cite_ref-31">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFMelanie_Pinola2011" class="citation web cs1">Melanie Pinola (2 November 2011). <a rel="nofollow" class="external text" href="http://www.pcworld.com/article/243060/speech_recognition_through_the_decades_how_we_ended_up_with_siri.html?page=2">"Speech Recognition Through the Decades: How We Ended Up With Siri"</a>. <i>PC World</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20170113074944/http://www.pcworld.com/article/243060/speech_recognition_through_the_decades_how_we_ended_up_with_siri.html?page=2">Archived</a> from the original on 13 January 2017<span class="reference-accessdate">. Retrieved <span class="nowrap">28 July</span> 2017</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=unknown&amp;rft.jtitle=PC+World&amp;rft.atitle=Speech+Recognition+Through+the+Decades%3A+How+We+Ended+Up+With+Siri&amp;rft.date=2011-11-02&amp;rft.au=Melanie+Pinola&amp;rft_id=http%3A%2F%2Fwww.pcworld.com%2Farticle%2F243060%2Fspeech_recognition_through_the_decades_how_we_ended_up_with_siri.html%3Fpage%3D2&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-KurzweilAIbio-32"><span class="mw-cite-backlink"><b><a href="#cite_ref-KurzweilAIbio_32-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="http://www.kurzweilai.net/ray-kurzweil-bio">"Ray Kurzweil biography"</a>. KurzweilAINetwork. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20140205002828/http://www.kurzweilai.net/ray-kurzweil-bio">Archived</a> from the original on 5 February 2014<span class="reference-accessdate">. Retrieved <span class="nowrap">25 September</span> 2014</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=Ray+Kurzweil+biography&amp;rft.pub=KurzweilAINetwork&amp;rft_id=http%3A%2F%2Fwww.kurzweilai.net%2Fray-kurzweil-bio&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-33"><span class="mw-cite-backlink"><b><a href="#cite_ref-33">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFJuangRabiner" class="citation report cs1">Juang, B.H.; Rabiner, Lawrence. <a rel="nofollow" class="external text" href="http://www.ece.ucsb.edu/Faculty/Rabiner/ece259/Reprints/354_LALI-ASRHistory-final-10-8.pdf">Automatic Speech Recognition – A Brief History of the Technology Development</a> <span class="cs1-format">(PDF)</span> (Report). <a rel="nofollow" class="external text" href="https://web.archive.org/web/20170809211311/http://www.ece.ucsb.edu/Faculty/Rabiner/ece259/Reprints/354_LALI-ASRHistory-final-10-8.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 9 August 2017<span class="reference-accessdate">. Retrieved <span class="nowrap">28 July</span> 2017</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=report&amp;rft.btitle=Automatic+Speech+Recognition+%E2%80%93+A+Brief+History+of+the+Technology+Development&amp;rft.aulast=Juang&amp;rft.aufirst=B.H.&amp;rft.au=Rabiner%2C+Lawrence&amp;rft_id=http%3A%2F%2Fwww.ece.ucsb.edu%2FFaculty%2FRabiner%2Fece259%2FReprints%2F354_LALI-ASRHistory-final-10-8.pdf&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-34"><span class="mw-cite-backlink"><b><a href="#cite_ref-34">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="http://techpinions.com/nuance-exec-on-iphone-4s-siri-and-the-future-of-speech/3307">"Nuance Exec on iPhone 4S, Siri, and the Future of Speech"</a>. Tech.pinions. 10 October 2011. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20111119211021/http://techpinions.com/nuance-exec-on-iphone-4s-siri-and-the-future-of-speech/3307">Archived</a> from the original on 19 November 2011<span class="reference-accessdate">. 
Retrieved <span class="nowrap">23 November</span> 2011</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=Nuance+Exec+on+iPhone+4S%2C+Siri%2C+and+the+Future+of+Speech&amp;rft.pub=Tech.pinions&amp;rft.date=2011-10-10&amp;rft_id=http%3A%2F%2Ftechpinions.com%2Fnuance-exec-on-iphone-4s-siri-and-the-future-of-speech%2F3307&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-35"><span class="mw-cite-backlink"><b><a href="#cite_ref-35">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://catalog.ldc.upenn.edu/LDC97S62">"Switchboard-1 Release 2"</a>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20170711061225/https://catalog.ldc.upenn.edu/LDC97S62">Archived</a> from the original on 11 July 2017<span class="reference-accessdate">. Retrieved <span class="nowrap">26 July</span> 2017</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=Switchboard-1+Release+2&amp;rft_id=https%3A%2F%2Fcatalog.ldc.upenn.edu%2FLDC97S62&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-36"><span class="mw-cite-backlink"><b><a href="#cite_ref-36">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFJason_Kincaid2011" class="citation web cs1">Jason Kincaid (13 February 2011). <a rel="nofollow" class="external text" href="https://techcrunch.com/2011/02/13/the-power-of-voice-a-conversation-with-the-head-of-googles-speech-technology/">"The Power of Voice: A Conversation With The Head Of Google's Speech Technology"</a>. <i>Tech Crunch</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20150721034447/http://techcrunch.com/2011/02/13/the-power-of-voice-a-conversation-with-the-head-of-googles-speech-technology/">Archived</a> from the original on 21 July 2015<span class="reference-accessdate">. Retrieved <span class="nowrap">21 July</span> 2015</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=unknown&amp;rft.jtitle=Tech+Crunch&amp;rft.atitle=The+Power+of+Voice%3A+A+Conversation+With+The+Head+Of+Google%27s+Speech+Technology&amp;rft.date=2011-02-13&amp;rft.au=Jason+Kincaid&amp;rft_id=https%3A%2F%2Ftechcrunch.com%2F2011%2F02%2F13%2Fthe-power-of-voice-a-conversation-with-the-head-of-googles-speech-technology%2F&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-37"><span class="mw-cite-backlink"><b><a href="#cite_ref-37">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFFroomkin2015" class="citation web cs1">Froomkin, Dan (5 May 2015). <a rel="nofollow" class="external text" href="https://firstlook.org/theintercept/2015/05/05/nsa-speech-recognition-snowden-searchable-text/">"THE COMPUTERS ARE LISTENING"</a>. <i>The Intercept</i>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20150627185007/https://firstlook.org/theintercept/2015/05/05/nsa-speech-recognition-snowden-searchable-text/">Archived</a> from the original on 27 June 2015<span class="reference-accessdate">. Retrieved <span class="nowrap">20 June</span> 2015</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=unknown&amp;rft.jtitle=The+Intercept&amp;rft.atitle=THE+COMPUTERS+ARE+LISTENING&amp;rft.date=2015-05-05&amp;rft.aulast=Froomkin&amp;rft.aufirst=Dan&amp;rft_id=https%3A%2F%2Ffirstlook.org%2Ftheintercept%2F2015%2F05%2F05%2Fnsa-speech-recognition-snowden-searchable-text%2F&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-bourlard1994-38"><span class="mw-cite-backlink"><b><a href="#cite_ref-bourlard1994_38-0">^</a></b></span> <span class="reference-text">Herve Bourlard and <a href="/wiki/Nelson_Morgan" title="Nelson Morgan">Nelson Morgan</a>, Connectionist Speech Recognition: A Hybrid Approach, The Kluwer International Series in Engineering and Computer Science; v. 247, Boston: Kluwer Academic Publishers, 1994.</span> </li> <li id="cite_note-lstm-39"><span class="mw-cite-backlink">^ <a href="#cite_ref-lstm_39-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-lstm_39-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFSepp_HochreiterJ._Schmidhuber1997" class="citation journal cs1"><a href="/wiki/Sepp_Hochreiter" title="Sepp Hochreiter">Sepp Hochreiter</a>; <a href="/wiki/J%C3%BCrgen_Schmidhuber" title="Jürgen Schmidhuber">J. Schmidhuber</a> (1997). "Long Short-Term Memory". <i>Neural Computation</i>. <b>9</b> (8): <span class="nowrap">1735–</span>1780. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1162%2Fneco.1997.9.8.1735">10.1162/neco.1997.9.8.1735</a>. <a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a>&#160;<a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/9377276">9377276</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:1915014">1915014</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Neural+Computation&amp;rft.atitle=Long+Short-Term+Memory&amp;rft.volume=9&amp;rft.issue=8&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E1735-%3C%2Fspan%3E1780&amp;rft.date=1997&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A1915014%23id-name%3DS2CID&amp;rft_id=info%3Apmid%2F9377276&amp;rft_id=info%3Adoi%2F10.1162%2Fneco.1997.9.8.1735&amp;rft.au=Sepp+Hochreiter&amp;rft.au=J.+Schmidhuber&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-schmidhuber2015-40"><span class="mw-cite-backlink"><b><a href="#cite_ref-schmidhuber2015_40-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFSchmidhuber2015" class="citation journal cs1"><a href="/wiki/J%C3%BCrgen_Schmidhuber" title="Jürgen Schmidhuber">Schmidhuber, Jürgen</a> (2015). 
"Deep learning in neural networks: An overview". <i>Neural Networks</i>. <b>61</b>: <span class="nowrap">85–</span>117. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1404.7828">1404.7828</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1016%2Fj.neunet.2014.09.003">10.1016/j.neunet.2014.09.003</a>. <a href="/wiki/PMID_(identifier)" class="mw-redirect" title="PMID (identifier)">PMID</a>&#160;<a rel="nofollow" class="external text" href="https://pubmed.ncbi.nlm.nih.gov/25462637">25462637</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:11715509">11715509</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Neural+Networks&amp;rft.atitle=Deep+learning+in+neural+networks%3A+An+overview&amp;rft.volume=61&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E85-%3C%2Fspan%3E117&amp;rft.date=2015&amp;rft_id=info%3Aarxiv%2F1404.7828&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A11715509%23id-name%3DS2CID&amp;rft_id=info%3Apmid%2F25462637&amp;rft_id=info%3Adoi%2F10.1016%2Fj.neunet.2014.09.003&amp;rft.aulast=Schmidhuber&amp;rft.aufirst=J%C3%BCrgen&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-graves2006-41"><span class="mw-cite-backlink"><b><a href="#cite_ref-graves2006_41-0">^</a></b></span> <span class="reference-text">Alex Graves, Santiago Fernandez, Faustino Gomez, and <a href="/wiki/J%C3%BCrgen_Schmidhuber" title="Jürgen Schmidhuber">Jürgen Schmidhuber</a> (2006). <a rel="nofollow" class="external text" href="https://mediatum.ub.tum.de/doc/1292048/file.pdf">Connectionist temporal classification: Labelling unsegmented sequence data with recurrent neural nets</a> <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240909053409/https://mediatum.ub.tum.de/doc/1292048/file.pdf">Archived</a> 9 September 2024 at the <a href="/wiki/Wayback_Machine" title="Wayback Machine">Wayback Machine</a>. Proceedings of ICML'06, pp. 369–376.</span> </li> <li id="cite_note-fernandez2007keyword-42"><span class="mw-cite-backlink"><b><a href="#cite_ref-fernandez2007keyword_42-0">^</a></b></span> <span class="reference-text">Santiago Fernandez, Alex Graves, and Jürgen Schmidhuber (2007). <a rel="nofollow" class="external text" href="https://www6.in.tum.de/pub/Main/Publications/Fernandez2007b.pdf">An application of recurrent neural networks to discriminative keyword spotting</a><sup class="noprint Inline-Template"><span style="white-space: nowrap;">&#91;<i><a href="/wiki/Wikipedia:Link_rot" title="Wikipedia:Link rot"><span title="&#160;Dead link tagged March 2023">permanent dead link</span></a></i><span style="visibility:hidden; color:transparent; padding-left:2px">&#8205;</span>&#93;</span></sup>. Proceedings of ICANN (2), pp. 
220–229.</span> </li> <li id="cite_note-sak2015-43"><span class="mw-cite-backlink">^ <a href="#cite_ref-sak2015_43-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-sak2015_43-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text">Haşim Sak, Andrew Senior, Kanishka Rao, Françoise Beaufays and Johan Schalkwyk (September 2015): "<link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://web.archive.org/web/20160309191532/http://googleresearch.blogspot.ch/2015/09/google-voice-search-faster-and-more.html">"Google voice search: faster and more accurate"</a>. Archived from <a rel="nofollow" class="external text" href="http://googleresearch.blogspot.ch/2015/09/google-voice-search-faster-and-more.html">the original</a> on 9 March 2016<span class="reference-accessdate">. Retrieved <span class="nowrap">5 April</span> 2016</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=Google+voice+search%3A+faster+and+more+accurate&amp;rft_id=http%3A%2F%2Fgoogleresearch.blogspot.ch%2F2015%2F09%2Fgoogle-voice-search-faster-and-more.html&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span>."</span> </li> <li id="cite_note-44"><span class="mw-cite-backlink"><b><a href="#cite_ref-44">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFDosovitskiyBeyerKolesnikovWeissenborn2021" class="citation arxiv cs1">Dosovitskiy, Alexey; Beyer, Lucas; Kolesnikov, Alexander; Weissenborn, Dirk; Zhai, Xiaohua; Unterthiner, Thomas; Dehghani, Mostafa; Minderer, Matthias; Heigold, Georg; Gelly, Sylvain; Uszkoreit, Jakob; Houlsby, Neil (3 June 2021). "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2010.11929">2010.11929</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CV">cs.CV</a>].</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=preprint&amp;rft.jtitle=arXiv&amp;rft.atitle=An+Image+is+Worth+16x16+Words%3A+Transformers+for+Image+Recognition+at+Scale&amp;rft.date=2021-06-03&amp;rft_id=info%3Aarxiv%2F2010.11929&amp;rft.aulast=Dosovitskiy&amp;rft.aufirst=Alexey&amp;rft.au=Beyer%2C+Lucas&amp;rft.au=Kolesnikov%2C+Alexander&amp;rft.au=Weissenborn%2C+Dirk&amp;rft.au=Zhai%2C+Xiaohua&amp;rft.au=Unterthiner%2C+Thomas&amp;rft.au=Dehghani%2C+Mostafa&amp;rft.au=Minderer%2C+Matthias&amp;rft.au=Heigold%2C+Georg&amp;rft.au=Gelly%2C+Sylvain&amp;rft.au=Uszkoreit%2C+Jakob&amp;rft.au=Houlsby%2C+Neil&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-45"><span class="mw-cite-backlink"><b><a href="#cite_ref-45">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFWuXiaoCodellaLiu2021" class="citation arxiv cs1">Wu, Haiping; Xiao, Bin; Codella, Noel; Liu, Mengchen; Dai, Xiyang; Yuan, Lu; Zhang, Lei (29 March 2021). "CvT: Introducing Convolutions to Vision Transformers". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2103.15808">2103.15808</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CV">cs.CV</a>].</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=preprint&amp;rft.jtitle=arXiv&amp;rft.atitle=CvT%3A+Introducing+Convolutions+to+Vision+Transformers&amp;rft.date=2021-03-29&amp;rft_id=info%3Aarxiv%2F2103.15808&amp;rft.aulast=Wu&amp;rft.aufirst=Haiping&amp;rft.au=Xiao%2C+Bin&amp;rft.au=Codella%2C+Noel&amp;rft.au=Liu%2C+Mengchen&amp;rft.au=Dai%2C+Xiyang&amp;rft.au=Yuan%2C+Lu&amp;rft.au=Zhang%2C+Lei&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-46"><span class="mw-cite-backlink"><b><a href="#cite_ref-46">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFVaswaniShazeerParmarUszkoreit2017" class="citation journal cs1">Vaswani, Ashish; Shazeer, Noam; Parmar, Niki; Uszkoreit, Jakob; Jones, Llion; Gomez, Aidan N; Kaiser, Łukasz; Polosukhin, Illia (2017). <a rel="nofollow" class="external text" href="https://papers.nips.cc/paper/2017/hash/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html">"Attention is All you Need"</a>. <i>Advances in Neural Information Processing Systems</i>. <b>30</b>. Curran Associates. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240909053411/https://papers.nips.cc/paper/2017/hash/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html">Archived</a> from the original on 9 September 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">9 September</span> 2024</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Advances+in+Neural+Information+Processing+Systems&amp;rft.atitle=Attention+is+All+you+Need&amp;rft.volume=30&amp;rft.date=2017&amp;rft.aulast=Vaswani&amp;rft.aufirst=Ashish&amp;rft.au=Shazeer%2C+Noam&amp;rft.au=Parmar%2C+Niki&amp;rft.au=Uszkoreit%2C+Jakob&amp;rft.au=Jones%2C+Llion&amp;rft.au=Gomez%2C+Aidan+N&amp;rft.au=Kaiser%2C+%C5%81ukasz&amp;rft.au=Polosukhin%2C+Illia&amp;rft_id=https%3A%2F%2Fpapers.nips.cc%2Fpaper%2F2017%2Fhash%2F3f5ee243547dee91fbd053c1c4a845aa-Abstract.html&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-47"><span class="mw-cite-backlink"><b><a href="#cite_ref-47">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFDevlinChangLeeToutanova2019" class="citation arxiv cs1">Devlin, Jacob; Chang, Ming-Wei; Lee, Kenton; Toutanova, Kristina (24 May 2019). "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1810.04805">1810.04805</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CL">cs.CL</a>].</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=preprint&amp;rft.jtitle=arXiv&amp;rft.atitle=BERT%3A+Pre-training+of+Deep+Bidirectional+Transformers+for+Language+Understanding&amp;rft.date=2019-05-24&amp;rft_id=info%3Aarxiv%2F1810.04805&amp;rft.aulast=Devlin&amp;rft.aufirst=Jacob&amp;rft.au=Chang%2C+Ming-Wei&amp;rft.au=Lee%2C+Kenton&amp;rft.au=Toutanova%2C+Kristina&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-:1-48"><span class="mw-cite-backlink">^ <a href="#cite_ref-:1_48-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:1_48-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFGongChungGlass2021" class="citation arxiv cs1">Gong, Yuan; Chung, Yu-An; Glass, James (8 July 2021). "AST: Audio Spectrogram Transformer". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2104.01778">2104.01778</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.SD">cs.SD</a>].</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=preprint&amp;rft.jtitle=arXiv&amp;rft.atitle=AST%3A+Audio+Spectrogram+Transformer&amp;rft.date=2021-07-08&amp;rft_id=info%3Aarxiv%2F2104.01778&amp;rft.aulast=Gong&amp;rft.aufirst=Yuan&amp;rft.au=Chung%2C+Yu-An&amp;rft.au=Glass%2C+James&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-:3-49"><span class="mw-cite-backlink">^ <a href="#cite_ref-:3_49-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:3_49-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFRisteaIonescuKhan2022" class="citation arxiv cs1">Ristea, Nicolae-Catalin; Ionescu, Radu Tudor; Khan, Fahad Shahbaz (20 June 2022). "SepTr: Separable Transformer for Audio Spectrogram Processing". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2203.09581">2203.09581</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.CV">cs.CV</a>].</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=preprint&amp;rft.jtitle=arXiv&amp;rft.atitle=SepTr%3A+Separable+Transformer+for+Audio+Spectrogram+Processing&amp;rft.date=2022-06-20&amp;rft_id=info%3Aarxiv%2F2203.09581&amp;rft.aulast=Ristea&amp;rft.aufirst=Nicolae-Catalin&amp;rft.au=Ionescu%2C+Radu+Tudor&amp;rft.au=Khan%2C+Fahad+Shahbaz&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-:4-50"><span class="mw-cite-backlink">^ <a href="#cite_ref-:4_50-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-:4_50-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFLohrenzLiFingscheidt2021" class="citation arxiv cs1">Lohrenz, Timo; Li, Zhengyang; Fingscheidt, Tim (14 July 2021). "Multi-Encoder Learning and Stream Fusion for Transformer-Based End-to-End Automatic Speech Recognition". <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2104.00120">2104.00120</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/eess.AS">eess.AS</a>].</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=preprint&amp;rft.jtitle=arXiv&amp;rft.atitle=Multi-Encoder+Learning+and+Stream+Fusion+for+Transformer-Based+End-to-End+Automatic+Speech+Recognition&amp;rft.date=2021-07-14&amp;rft_id=info%3Aarxiv%2F2104.00120&amp;rft.aulast=Lohrenz&amp;rft.aufirst=Timo&amp;rft.au=Li%2C+Zhengyang&amp;rft.au=Fingscheidt%2C+Tim&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-51"><span class="mw-cite-backlink"><b><a href="#cite_ref-51">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://lidengsite.wordpress.com/">"Li Deng"</a>. Li Deng Site. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240909052323/https://lidengsite.wordpress.com/">Archived</a> from the original on 9 September 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">9 September</span> 2024</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=Li+Deng&amp;rft.pub=Li+Deng+Site&amp;rft_id=https%3A%2F%2Flidengsite.wordpress.com%2F&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-NIPS2009-52"><span class="mw-cite-backlink"><b><a href="#cite_ref-NIPS2009_52-0">^</a></b></span> <span class="reference-text">NIPS Workshop: Deep Learning for Speech Recognition and Related Applications, Whistler, BC, Canada, Dec. 2009 (Organizers: Li Deng, Geoff Hinton, D. 
Yu).</span> </li> <li id="cite_note-HintonDengYu2012-53"><span class="mw-cite-backlink">^ <a href="#cite_ref-HintonDengYu2012_53-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-HintonDengYu2012_53-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-HintonDengYu2012_53-2"><sup><i><b>c</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFHintonDengYuDahl2012" class="citation journal cs1">Hinton, Geoffrey; Deng, Li; Yu, Dong; Dahl, George; Mohamed, Abdel-Rahman; Jaitly, Navdeep; Senior, Andrew; Vanhoucke, Vincent; Nguyen, Patrick; <a href="/wiki/Tara_Sainath" title="Tara Sainath">Sainath, Tara</a>; Kingsbury, Brian (2012). "Deep Neural Networks for Acoustic Modeling in Speech Recognition: The shared views of four research groups". <i>IEEE Signal Processing Magazine</i>. <b>29</b> (6): <span class="nowrap">82–</span>97. <a href="/wiki/Bibcode_(identifier)" class="mw-redirect" title="Bibcode (identifier)">Bibcode</a>:<a rel="nofollow" class="external text" href="https://ui.adsabs.harvard.edu/abs/2012ISPM...29...82H">2012ISPM...29...82H</a>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FMSP.2012.2205597">10.1109/MSP.2012.2205597</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:206485943">206485943</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=IEEE+Signal+Processing+Magazine&amp;rft.atitle=Deep+Neural+Networks+for+Acoustic+Modeling+in+Speech+Recognition%3A+The+shared+views+of+four+research+groups&amp;rft.volume=29&amp;rft.issue=6&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E82-%3C%2Fspan%3E97&amp;rft.date=2012&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A206485943%23id-name%3DS2CID&amp;rft_id=info%3Adoi%2F10.1109%2FMSP.2012.2205597&amp;rft_id=info%3Abibcode%2F2012ISPM...29...82H&amp;rft.aulast=Hinton&amp;rft.aufirst=Geoffrey&amp;rft.au=Deng%2C+Li&amp;rft.au=Yu%2C+Dong&amp;rft.au=Dahl%2C+George&amp;rft.au=Mohamed%2C+Abdel-Rahman&amp;rft.au=Jaitly%2C+Navdeep&amp;rft.au=Senior%2C+Andrew&amp;rft.au=Vanhoucke%2C+Vincent&amp;rft.au=Nguyen%2C+Patrick&amp;rft.au=Sainath%2C+Tara&amp;rft.au=Kingsbury%2C+Brian&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-ReferenceICASSP2013-54"><span class="mw-cite-backlink">^ <a href="#cite_ref-ReferenceICASSP2013_54-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-ReferenceICASSP2013_54-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFDengHintonKingsbury2013" class="citation book cs1">Deng, L.; Hinton, G.; Kingsbury, B. (2013). "New types of deep neural network learning for speech recognition and related applications: An overview". <i>2013 IEEE International Conference on Acoustics, Speech and Signal Processing: New types of deep neural network learning for speech recognition and related applications: An overview</i>. p.&#160;8599. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FICASSP.2013.6639344">10.1109/ICASSP.2013.6639344</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a>&#160;<a href="/wiki/Special:BookSources/978-1-4799-0356-6" title="Special:BookSources/978-1-4799-0356-6"><bdi>978-1-4799-0356-6</bdi></a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:13953660">13953660</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=bookitem&amp;rft.atitle=New+types+of+deep+neural+network+learning+for+speech+recognition+and+related+applications%3A+An+overview&amp;rft.btitle=2013+IEEE+International+Conference+on+Acoustics%2C+Speech+and+Signal+Processing%3A+New+types+of+deep+neural+network+learning+for+speech+recognition+and+related+applications%3A+An+overview&amp;rft.pages=8599&amp;rft.date=2013&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A13953660%23id-name%3DS2CID&amp;rft_id=info%3Adoi%2F10.1109%2FICASSP.2013.6639344&amp;rft.isbn=978-1-4799-0356-6&amp;rft.aulast=Deng&amp;rft.aufirst=L.&amp;rft.au=Hinton%2C+G.&amp;rft.au=Kingsbury%2C+B.&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-Scientists-see-advances-55"><span class="mw-cite-backlink">^ <a href="#cite_ref-Scientists-see-advances_55-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Scientists-see-advances_55-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFMarkoff2012" class="citation news cs1">Markoff, John (23 November 2012). <a rel="nofollow" class="external text" href="https://www.nytimes.com/2012/11/24/science/scientists-see-advances-in-deep-learning-a-part-of-artificial-intelligence.html">"Scientists See Promise in Deep-Learning Programs"</a>. <i>New York Times</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20121130080314/http://www.nytimes.com/2012/11/24/science/scientists-see-advances-in-deep-learning-a-part-of-artificial-intelligence.html">Archived</a> from the original on 30 November 2012<span class="reference-accessdate">. Retrieved <span class="nowrap">20 January</span> 2015</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=New+York+Times&amp;rft.atitle=Scientists+See+Promise+in+Deep-Learning+Programs&amp;rft.date=2012-11-23&amp;rft.aulast=Markoff&amp;rft.aufirst=John&amp;rft_id=https%3A%2F%2Fwww.nytimes.com%2F2012%2F11%2F24%2Fscience%2Fscientists-see-advances-in-deep-learning-a-part-of-artificial-intelligence.html&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-Morgan1993-56"><span class="mw-cite-backlink"><b><a href="#cite_ref-Morgan1993_56-0">^</a></b></span> <span class="reference-text">Morgan, Bourlard, Renals, Cohen, Franco (1993) "Hybrid neural network/hidden Markov model systems for continuous speech recognition. 
ICASSP/IJPRAI"</span> </li> <li id="cite_note-Robinson1992-57"><span class="mw-cite-backlink"><b><a href="#cite_ref-Robinson1992_57-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFT._Robinson1992" class="citation book cs1"><a href="/wiki/Tony_Robinson_(speech_recognition)" title="Tony Robinson (speech recognition)">T. Robinson</a> (1992). <a rel="nofollow" class="external text" href="https://www.researchgate.net/publication/3532171">"A real-time recurrent error propagation network word recognition system"</a>. <i>&#91;Proceedings&#93; ICASSP-92: 1992 IEEE International Conference on Acoustics, Speech, and Signal Processing</i>. pp.&#160;617–620 vol.1. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FICASSP.1992.225833">10.1109/ICASSP.1992.225833</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a>&#160;<a href="/wiki/Special:BookSources/0-7803-0532-9" title="Special:BookSources/0-7803-0532-9"><bdi>0-7803-0532-9</bdi></a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:62446313">62446313</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=bookitem&amp;rft.atitle=A+real-time+recurrent+error+propagation+network+word+recognition+system&amp;rft.btitle=%26%2391%3BProceedings%26%2393%3B+ICASSP-92%3A+1992+IEEE+International+Conference+on+Acoustics%2C+Speech%2C+and+Signal+Processing&amp;rft.pages=617-620+vol.1&amp;rft.date=1992&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A62446313%23id-name%3DS2CID&amp;rft_id=info%3Adoi%2F10.1109%2FICASSP.1992.225833&amp;rft.isbn=0-7803-0532-9&amp;rft.au=T.+Robinson&amp;rft_id=https%3A%2F%2Fwww.researchgate.net%2Fpublication%2F3532171&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-Waibel1989-58"><span class="mw-cite-backlink"><b><a href="#cite_ref-Waibel1989_58-0">^</a></b></span> <span class="reference-text"><a href="/wiki/Alex_Waibel" title="Alex Waibel">Waibel</a>, Hanazawa, Hinton, Shikano, Lang. (1989) "<a rel="nofollow" class="external text" href="http://www.inf.ufrgs.br/~engel/data/media/file/cmp121/waibel89_TDNN.pdf">Phoneme recognition using time-delay neural networks</a> <a rel="nofollow" class="external text" href="https://web.archive.org/web/20210225163001/http://www.inf.ufrgs.br/~engel/data/media/file/cmp121/waibel89_TDNN.pdf">Archived</a> 25 February 2021 at the <a href="/wiki/Wayback_Machine" title="Wayback Machine">Wayback Machine</a>. IEEE Transactions on Acoustics, Speech, and Signal Processing."</span> </li> <li id="cite_note-Baker2009-59"><span class="mw-cite-backlink"><b><a href="#cite_ref-Baker2009_59-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFBakerLi_DengGlassKhudanpur2009" class="citation journal cs1">Baker, J.; Li Deng; Glass, J.; Khudanpur, S.; <a href="/wiki/Chin-Hui_Lee" title="Chin-Hui Lee">Chin-Hui Lee</a>; Morgan, N.; O'Shaughnessy, D. (2009). "Developments and Directions in Speech Recognition and Understanding, Part 1". <i>IEEE Signal Processing Magazine</i>. 
<b>26</b> (3): <span class="nowrap">75–</span>80. <a href="/wiki/Bibcode_(identifier)" class="mw-redirect" title="Bibcode (identifier)">Bibcode</a>:<a rel="nofollow" class="external text" href="https://ui.adsabs.harvard.edu/abs/2009ISPM...26...75B">2009ISPM...26...75B</a>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FMSP.2009.932166">10.1109/MSP.2009.932166</a>. <a href="/wiki/Hdl_(identifier)" class="mw-redirect" title="Hdl (identifier)">hdl</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://hdl.handle.net/1721.1%2F51891">1721.1/51891</a></span>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:357467">357467</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=IEEE+Signal+Processing+Magazine&amp;rft.atitle=Developments+and+Directions+in+Speech+Recognition+and+Understanding%2C+Part+1&amp;rft.volume=26&amp;rft.issue=3&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E75-%3C%2Fspan%3E80&amp;rft.date=2009&amp;rft_id=info%3Ahdl%2F1721.1%2F51891&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A357467%23id-name%3DS2CID&amp;rft_id=info%3Adoi%2F10.1109%2FMSP.2009.932166&amp;rft_id=info%3Abibcode%2F2009ISPM...26...75B&amp;rft.aulast=Baker&amp;rft.aufirst=J.&amp;rft.au=Li+Deng&amp;rft.au=Glass%2C+J.&amp;rft.au=Khudanpur%2C+S.&amp;rft.au=Chin-Hui+Lee&amp;rft.au=Morgan%2C+N.&amp;rft.au=O%27Shaughnessy%2C+D.&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-hochreiter1991-60"><span class="mw-cite-backlink"><b><a href="#cite_ref-hochreiter1991_60-0">^</a></b></span> <span class="reference-text"><a href="/wiki/Sepp_Hochreiter" title="Sepp Hochreiter">Sepp Hochreiter</a> (1991), <a rel="nofollow" class="external text" href="http://people.idsia.ch/~juergen/SeppHochreiter1991ThesisAdvisorSchmidhuber.pdf">Untersuchungen zu dynamischen neuronalen Netzen</a> <a rel="nofollow" class="external text" href="https://web.archive.org/web/20150306075401/http://people.idsia.ch/~juergen/SeppHochreiter1991ThesisAdvisorSchmidhuber.pdf">Archived</a> 6 March 2015 at the <a href="/wiki/Wayback_Machine" title="Wayback Machine">Wayback Machine</a>, Diploma thesis. Institut f. Informatik, Technische Univ. Munich. Advisor: J. Schmidhuber.</span> </li> <li id="cite_note-Bengio1991-61"><span class="mw-cite-backlink"><b><a href="#cite_ref-Bengio1991_61-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFBengio1991" class="citation thesis cs1">Bengio, Y. (1991). <a rel="nofollow" class="external text" href="https://elibrary.ru/item.asp?id=5790854"><i>Artificial Neural Networks and their Application to Speech/Sequence Recognition</i></a> (Ph.D. thesis). 
<q>pronunciation researchers are primarily interested in improving L2 learners' intelligibility and comprehensibility, but they have not yet collected sufficient amounts of representative and reliable data (speech recordings with corresponding annotations and judgments) indicating which errors affect these speech dimensions and which do not. These data are essential to train ASR algorithms to assess L2 learners' intelligibility.</q></cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Journal+of+Second+Language+Pronunciation&amp;rft.atitle=Directions+for+the+future+of+technology+in+pronunciation+research+and+teaching&amp;rft.volume=4&amp;rft.issue=2&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E182-%3C%2Fspan%3E207&amp;rft.date=2018-12-31&amp;rft_id=info%3Ahdl%2F2066%2F199273&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A86440885%23id-name%3DS2CID&amp;rft.issn=2215-1931&amp;rft_id=info%3Adoi%2F10.1075%2Fjslp.17001.obr&amp;rft.aulast=O%E2%80%99Brien&amp;rft.aufirst=Mary+Grantham&amp;rft.au=Derwing%2C+Tracey+M.&amp;rft_id=https%3A%2F%2Fdoi.org%2F10.1075%252Fjslp.17001.obr&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-109"><span class="mw-cite-backlink"><b><a href="#cite_ref-109">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFEskenazi1999" class="citation journal cs1">Eskenazi, Maxine (January 1999). <a rel="nofollow" class="external text" href="https://www.lltjournal.org/item/10125-25043/">"Using automatic speech processing for foreign language pronunciation tutoring: Some issues and a prototype"</a>. <i>Language Learning &amp; Technology</i>. <b>2</b> (2): <span class="nowrap">62–</span>76. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240909053942/https://www.lltjournal.org/item/10125-25043/">Archived</a> from the original on 9 September 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">11 February</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Language+Learning+%26+Technology&amp;rft.atitle=Using+automatic+speech+processing+for+foreign+language+pronunciation+tutoring%3A+Some+issues+and+a+prototype&amp;rft.volume=2&amp;rft.issue=2&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E62-%3C%2Fspan%3E76&amp;rft.date=1999-01&amp;rft.aulast=Eskenazi&amp;rft.aufirst=Maxine&amp;rft_id=https%3A%2F%2Fwww.lltjournal.org%2Fitem%2F10125-25043%2F&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-110"><span class="mw-cite-backlink"><b><a href="#cite_ref-110">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFTholfsen2023" class="citation news cs1">Tholfsen, Mike (9 February 2023). <a rel="nofollow" class="external text" href="https://techcommunity.microsoft.com/t5/education-blog/reading-coach-in-immersive-reader-plus-new-features-coming-to/ba-p/3734079">"Reading Coach in Immersive Reader plus new features coming to Reading Progress in Microsoft Teams"</a>. <i>Techcommunity Education Blog</i>. Microsoft. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20240909052822/https://techcommunity.microsoft.com/t5/education-blog/reading-coach-in-immersive-reader-plus-new-features-coming-to/ba-p/3734079">Archived</a> from the original on 9 September 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">12 February</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Techcommunity+Education+Blog&amp;rft.atitle=Reading+Coach+in+Immersive+Reader+plus+new+features+coming+to+Reading+Progress+in+Microsoft+Teams&amp;rft.date=2023-02-09&amp;rft.aulast=Tholfsen&amp;rft.aufirst=Mike&amp;rft_id=https%3A%2F%2Ftechcommunity.microsoft.com%2Ft5%2Feducation-blog%2Freading-coach-in-immersive-reader-plus-new-features-coming-to%2Fba-p%2F3734079&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-111"><span class="mw-cite-backlink"><b><a href="#cite_ref-111">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFBanerji2023" class="citation news cs1">Banerji, Olina (7 March 2023). <a rel="nofollow" class="external text" href="https://www.edsurge.com/news/2023-03-07-schools-are-using-voice-technology-to-teach-reading-is-it-helping">"Schools Are Using Voice Technology to Teach Reading. Is It Helping?"</a>. <i>EdSurge News</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240909054611/https://www.edsurge.com/news/2023-03-07-schools-are-using-voice-technology-to-teach-reading-is-it-helping">Archived</a> from the original on 9 September 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">7 March</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=EdSurge+News&amp;rft.atitle=Schools+Are+Using+Voice+Technology+to+Teach+Reading.+Is+It+Helping%3F&amp;rft.date=2023-03-07&amp;rft.aulast=Banerji&amp;rft.aufirst=Olina&amp;rft_id=https%3A%2F%2Fwww.edsurge.com%2Fnews%2F2023-03-07-schools-are-using-voice-technology-to-teach-reading-is-it-helping&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-112"><span class="mw-cite-backlink"><b><a href="#cite_ref-112">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFHairMonroe2018" class="citation book cs1">Hair, Adam; et&#160;al. (19 June 2018). "Apraxia world: A speech therapy game for children with speech sound disorders". <a rel="nofollow" class="external text" href="https://psi.engr.tamu.edu/wp-content/uploads/2018/04/hair2018idc.pdf"><i>Proceedings of the 17th ACM Conference on Interaction Design and Children</i></a> <span class="cs1-format">(PDF)</span>. pp.&#160;<span class="nowrap">119–</span>131. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1145%2F3202185.3202733">10.1145/3202185.3202733</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a>&#160;<a href="/wiki/Special:BookSources/9781450351522" title="Special:BookSources/9781450351522"><bdi>9781450351522</bdi></a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:13790002">13790002</a>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240909052803/https://psi.engr.tamu.edu/wp-content/uploads/2018/04/hair2018idc.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 9 September 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">9 September</span> 2024</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=bookitem&amp;rft.atitle=Apraxia+world%3A+A+speech+therapy+game+for+children+with+speech+sound+disorders&amp;rft.btitle=Proceedings+of+the+17th+ACM+Conference+on+Interaction+Design+and+Children&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E119-%3C%2Fspan%3E131&amp;rft.date=2018-06-19&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A13790002%23id-name%3DS2CID&amp;rft_id=info%3Adoi%2F10.1145%2F3202185.3202733&amp;rft.isbn=9781450351522&amp;rft.aulast=Hair&amp;rft.aufirst=Adam&amp;rft.au=Monroe%2C+Penelope&amp;rft_id=https%3A%2F%2Fpsi.engr.tamu.edu%2Fwp-content%2Fuploads%2F2018%2F04%2Fhair2018idc.pdf&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-113"><span class="mw-cite-backlink"><b><a href="#cite_ref-113">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation news cs1"><a rel="nofollow" class="external text" href="https://www.theguardian.com/australia-news/2017/aug/08/computer-says-no-irish-vet-fails-oral-english-test-needed-to-stay-in-australia">"Computer says no: Irish vet fails oral English test needed to stay in Australia"</a>. <i>The Guardian</i>. Australian Associated Press. 8 August 2017. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240909052806/https://www.theguardian.com/australia-news/2017/aug/08/computer-says-no-irish-vet-fails-oral-english-test-needed-to-stay-in-australia">Archived</a> from the original on 9 September 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">12 February</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=The+Guardian&amp;rft.atitle=Computer+says+no%3A+Irish+vet+fails+oral+English+test+needed+to+stay+in+Australia&amp;rft.date=2017-08-08&amp;rft_id=https%3A%2F%2Fwww.theguardian.com%2Faustralia-news%2F2017%2Faug%2F08%2Fcomputer-says-no-irish-vet-fails-oral-english-test-needed-to-stay-in-australia&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-114"><span class="mw-cite-backlink"><b><a href="#cite_ref-114">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFFerrier2017" class="citation news cs1">Ferrier, Tracey (9 August 2017). <a rel="nofollow" class="external text" href="https://www.smh.com.au/technology/australian-exnews-reader-with-english-degree-fails-robots-english-test-20170809-gxsjv2.html">"Australian ex-news reader with English degree fails robot's English test"</a>. <i>The Sydney Morning Herald</i>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20240909053307/https://www.smh.com.au/technology/australian-exnews-reader-with-english-degree-fails-robots-english-test-20170809-gxsjv2.html">Archived</a> from the original on 9 September 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">12 February</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=The+Sydney+Morning+Herald&amp;rft.atitle=Australian+ex-news+reader+with+English+degree+fails+robot%27s+English+test&amp;rft.date=2017-08-09&amp;rft.aulast=Ferrier&amp;rft.aufirst=Tracey&amp;rft_id=https%3A%2F%2Fwww.smh.com.au%2Ftechnology%2Faustralian-exnews-reader-with-english-degree-fails-robots-english-test-20170809-gxsjv2.html&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-115"><span class="mw-cite-backlink"><b><a href="#cite_ref-115">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFMainWatson2022" class="citation news cs1">Main, Ed; Watson, Richard (9 February 2022). <a rel="nofollow" class="external text" href="https://www.bbc.com/news/uk-60264106">"The English test that ruined thousands of lives"</a>. <i>BBC News</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240909054614/https://www.bbc.com/news/uk-60264106">Archived</a> from the original on 9 September 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">12 February</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=BBC+News&amp;rft.atitle=The+English+test+that+ruined+thousands+of+lives&amp;rft.date=2022-02-09&amp;rft.aulast=Main&amp;rft.aufirst=Ed&amp;rft.au=Watson%2C+Richard&amp;rft_id=https%3A%2F%2Fwww.bbc.com%2Fnews%2Fuk-60264106&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-116"><span class="mw-cite-backlink"><b><a href="#cite_ref-116">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFJoyce2023" class="citation web cs1">Joyce, Katy Spratte (24 January 2023). <a rel="nofollow" class="external text" href="https://www.rd.com/list/words-that-can-be-pronounced-two-ways/">"13 Words That Can Be Pronounced Two Ways"</a>. Reader's Digest. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240909054447/https://www.rd.com/list/words-that-can-be-pronounced-two-ways/">Archived</a> from the original on 9 September 2024<span class="reference-accessdate">. 
Retrieved <span class="nowrap">23 February</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=13+Words+That+Can+Be+Pronounced+Two+Ways&amp;rft.pub=Reader%27s+Digest&amp;rft.date=2023-01-24&amp;rft.aulast=Joyce&amp;rft.aufirst=Katy+Spratte&amp;rft_id=https%3A%2F%2Fwww.rd.com%2Flist%2Fwords-that-can-be-pronounced-two-ways%2F&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-117"><span class="mw-cite-backlink"><b><a href="#cite_ref-117">^</a></b></span> <span class="reference-text">E.g., <a href="/wiki/CMU_Pronouncing_Dictionary" title="CMU Pronouncing Dictionary">CMUDICT</a>, <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="http://www.speech.cs.cmu.edu/cgi-bin/cmudict">"The CMU Pronouncing Dictionary"</a>. <i>www.speech.cs.cmu.edu</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20100815023012/http://www.speech.cs.cmu.edu/cgi-bin/cmudict">Archived</a> from the original on 15 August 2010<span class="reference-accessdate">. Retrieved <span class="nowrap">15 February</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=unknown&amp;rft.jtitle=www.speech.cs.cmu.edu&amp;rft.atitle=The+CMU+Pronouncing+Dictionary&amp;rft_id=http%3A%2F%2Fwww.speech.cs.cmu.edu%2Fcgi-bin%2Fcmudict&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span> Compare "four" given as "F AO R" with the vowel AO as in "caught," to "row" given as "R OW" with the vowel OW as in "oat."</span> </li> <li id="cite_note-118"><span class="mw-cite-backlink"><b><a href="#cite_ref-118">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFTuMaBarker2022" class="citation conference cs1">Tu, Zehai; Ma, Ning; Barker, Jon (2022). <a rel="nofollow" class="external text" href="https://www.isca-speech.org/archive/pdfs/interspeech_2022/tu22b_interspeech.pdf">"Unsupervised Uncertainty Measures of Automatic Speech Recognition for Non-intrusive Speech Intelligibility Prediction"</a> <span class="cs1-format">(PDF)</span>. <i>Proc. Interspeech 2022</i>. INTERSPEECH 2022. ISCA. pp.&#160;<span class="nowrap">3493–</span>3497. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.21437%2FInterspeech.2022-10408">10.21437/Interspeech.2022-10408</a>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240909053824/https://www.isca-speech.org/archive/pdfs/interspeech_2022/tu22b_interspeech.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 9 September 2024<span class="reference-accessdate">. 
Retrieved <span class="nowrap">17 December</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=conference&amp;rft.atitle=Unsupervised+Uncertainty+Measures+of+Automatic+Speech+Recognition+for+Non-intrusive+Speech+Intelligibility+Prediction&amp;rft.btitle=Proc.+Interspeech+2022&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E3493-%3C%2Fspan%3E3497&amp;rft.pub=ISCA&amp;rft.date=2022&amp;rft_id=info%3Adoi%2F10.21437%2FInterspeech.2022-10408&amp;rft.aulast=Tu&amp;rft.aufirst=Zehai&amp;rft.au=Ma%2C+Ning&amp;rft.au=Barker%2C+Jon&amp;rft_id=https%3A%2F%2Fwww.isca-speech.org%2Farchive%2Fpdfs%2Finterspeech_2022%2Ftu22b_interspeech.pdf&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-119"><span class="mw-cite-backlink"><b><a href="#cite_ref-119">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation book cs1"><a rel="nofollow" class="external text" href="https://rm.coe.int/cefr-companion-volume-with-new-descriptors-2018/1680787989"><i>Common European framework of reference for languages learning, teaching, assessment: Companion volume with new descriptors</i></a>. Language Policy Programme, Education Policy Division, Education Department, <a href="/wiki/Council_of_Europe" title="Council of Europe">Council of Europe</a>. February 2018. p.&#160;136. <a href="/wiki/OCLC_(identifier)" class="mw-redirect" title="OCLC (identifier)">OCLC</a>&#160;<a rel="nofollow" class="external text" href="https://search.worldcat.org/oclc/1090351600">1090351600</a>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240909053825/https://rm.coe.int/cefr-companion-volume-with-new-descriptors-2018/1680787989">Archived</a> from the original on 9 September 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">9 September</span> 2024</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=book&amp;rft.btitle=Common+European+framework+of+reference+for+languages+learning%2C+teaching%2C+assessment%3A+Companion+volume+with+new+descriptors&amp;rft.pages=136&amp;rft.pub=Language+Policy+Programme%2C+Education+Policy+Division%2C+Education+Department%2C+Council+of+Europe&amp;rft.date=2018-02&amp;rft_id=info%3Aoclcnum%2F1090351600&amp;rft_id=https%3A%2F%2Frm.coe.int%2Fcefr-companion-volume-with-new-descriptors-2018%2F1680787989&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-120"><span class="mw-cite-backlink"><b><a href="#cite_ref-120">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFEnglund2004" class="citation thesis cs1">Englund, Christine (2004). <a rel="nofollow" class="external text" href="http://www.speech.kth.se/prod/publications/files/1664.pdf"><i>Speech recognition in the JAS&#160;39 Gripen aircraft: Adaptation to speech at different G-loads</i></a> <span class="cs1-format">(PDF)</span> (Masters thesis thesis). <a href="/wiki/Stockholm_University" title="Stockholm University">Stockholm Royal Institute of Technology</a>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20081002002102/http://www.speech.kth.se/prod/publications/files/1664.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 2 October 2008.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Adissertation&amp;rft.title=Speech+recognition+in+the+JAS+39+Gripen+aircraft%3A+Adaptation+to+speech+at+different+G-loads&amp;rft.degree=Masters+thesis&amp;rft.inst=Stockholm+Royal+Institute+of+Technology&amp;rft.date=2004&amp;rft.aulast=Englund&amp;rft.aufirst=Christine&amp;rft_id=http%3A%2F%2Fwww.speech.kth.se%2Fprod%2Fpublications%2Ffiles%2F1664.pdf&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-121"><span class="mw-cite-backlink"><b><a href="#cite_ref-121">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://www.eurofighter.com/the-aircraft#cockpit">"The Cockpit"</a>. <i>Eurofighter Typhoon</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20170301222529/https://www.eurofighter.com/the-aircraft#cockpit">Archived</a> from the original on 1 March 2017.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=unknown&amp;rft.jtitle=Eurofighter+Typhoon&amp;rft.atitle=The+Cockpit&amp;rft_id=https%3A%2F%2Fwww.eurofighter.com%2Fthe-aircraft%23cockpit&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-122"><span class="mw-cite-backlink"><b><a href="#cite_ref-122">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="http://www.eurofighter.com/capabilities/technology/voice-throttle-stick/direct-voice-input.html">"Eurofighter Typhoon – The world's most advanced fighter aircraft"</a>. <i>www.eurofighter.com</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20130511025203/http://www.eurofighter.com/capabilities/technology/voice-throttle-stick/direct-voice-input.html">Archived</a> from the original on 11 May 2013<span class="reference-accessdate">. Retrieved <span class="nowrap">1 May</span> 2018</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=unknown&amp;rft.jtitle=www.eurofighter.com&amp;rft.atitle=Eurofighter+Typhoon+%E2%80%93+The+world%27s+most+advanced+fighter+aircraft&amp;rft_id=http%3A%2F%2Fwww.eurofighter.com%2Fcapabilities%2Ftechnology%2Fvoice-throttle-stick%2Fdirect-voice-input.html&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-123"><span class="mw-cite-backlink"><b><a href="#cite_ref-123">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFSchutte2007" class="citation web cs1">Schutte, John (15 October 2007). <a rel="nofollow" class="external text" href="https://www.af.mil/News/story/id/123071861/">"Researchers fine-tune F-35 pilot-aircraft speech system"</a>. United States Air Force. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20071020030310/http://www.af.mil/news/story.asp?id=123071861">Archived</a> from the original on 20 October 2007.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=Researchers+fine-tune+F-35+pilot-aircraft+speech+system&amp;rft.pub=United+States+Air+Force&amp;rft.date=2007-10-15&amp;rft.aulast=Schutte&amp;rft.aufirst=John&amp;rft_id=https%3A%2F%2Fwww.af.mil%2FNews%2Fstory%2Fid%2F123071861%2F&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-124"><span class="mw-cite-backlink"><b><a href="#cite_ref-124">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1 cs1-prop-unfit"><a rel="nofollow" class="external text" href="https://web.archive.org/web/20130725024622/http://www.massmatch.org/aboutus/listserv/2010/2010-03-31.html">"Overcoming Communication Barriers in the Classroom"</a>. MassMATCH. 18 March 2010. Archived from the original on 25 July 2013<span class="reference-accessdate">. Retrieved <span class="nowrap">15 June</span> 2013</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=Overcoming+Communication+Barriers+in+the+Classroom&amp;rft.pub=MassMATCH&amp;rft.date=2010-03-18&amp;rft_id=http%3A%2F%2Fwww.massmatch.org%2Faboutus%2Flistserv%2F2010%2F2010-03-31.html&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-brainline-125"><span class="mw-cite-backlink">^ <a href="#cite_ref-brainline_125-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-brainline_125-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="http://www.brainline.org/content/2010/12/speech-recognition-for-learning_pageall.html">"Speech Recognition for Learning"</a>. National Center for Technology Innovation. 2010. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20140413100513/http://www.brainline.org/content/2010/12/speech-recognition-for-learning_pageall.html">Archived</a> from the original on 13 April 2014<span class="reference-accessdate">. Retrieved <span class="nowrap">26 March</span> 2014</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=Speech+Recognition+for+Learning&amp;rft.pub=National+Center+for+Technology+Innovation&amp;rft.date=2010&amp;rft_id=http%3A%2F%2Fwww.brainline.org%2Fcontent%2F2010%2F12%2Fspeech-recognition-for-learning_pageall.html&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-126"><span class="mw-cite-backlink"><b><a href="#cite_ref-126">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFFollensbeeMcCloskey-Dale2000" class="citation web cs1">Follensbee, Bob; McCloskey-Dale, Susan (2000). <a rel="nofollow" class="external text" href="http://www.csun.edu/~hfdss006/conf/2000/proceedings/0219Follansbee.htm">"Speech recognition in schools: An update from the field"</a>. 
<i>Technology And Persons With Disabilities Conference 2000</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20060821213145/http://www.csun.edu/~hfdss006/conf/2000/proceedings/0219Follansbee.htm">Archived</a> from the original on 21 August 2006<span class="reference-accessdate">. Retrieved <span class="nowrap">26 March</span> 2014</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=unknown&amp;rft.jtitle=Technology+And+Persons+With+Disabilities+Conference+2000&amp;rft.atitle=Speech+recognition+in+schools%3A+An+update+from+the+field&amp;rft.date=2000&amp;rft.aulast=Follensbee&amp;rft.aufirst=Bob&amp;rft.au=McCloskey-Dale%2C+Susan&amp;rft_id=http%3A%2F%2Fwww.csun.edu%2F~hfdss006%2Fconf%2F2000%2Fproceedings%2F0219Follansbee.htm&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-127"><span class="mw-cite-backlink"><b><a href="#cite_ref-127">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://web.archive.org/web/20080404013302/http://www.businessweek.com/1998/08/b3566022.htm">"Speech recognition for disabled people"</a>. Archived from <a rel="nofollow" class="external text" href="http://www.businessweek.com/1998/08/b3566022.htm">the original</a> on 4 April 2008.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=Speech+recognition+for+disabled+people&amp;rft_id=http%3A%2F%2Fwww.businessweek.com%2F1998%2F08%2Fb3566022.htm&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-128"><span class="mw-cite-backlink"><b><a href="#cite_ref-128">^</a></b></span> <span class="reference-text"><a href="/wiki/Friends_International_Support_Group" title="Friends International Support Group">Friends International Support Group</a></span> </li> <li id="cite_note-129"><span class="mw-cite-backlink"><b><a href="#cite_ref-129">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFGarrett2011" class="citation journal cs1">Garrett, Jennifer Tumlin; et&#160;al. (2011). <a rel="nofollow" class="external text" href="https://scholarworks.gsu.edu/epse_diss/46">"Using Speech Recognition Software to Increase Writing Fluency for Individuals with Physical Disabilities"</a>. <i>Journal of Special Education Technology</i>. <b>26</b> (1): <span class="nowrap">25–</span>41. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1177%2F016264341102600104">10.1177/016264341102600104</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:142730664">142730664</a>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240909053848/https://scholarworks.gsu.edu/epse_diss/46/">Archived</a> from the original on 9 September 2024<span class="reference-accessdate">. 
Retrieved <span class="nowrap">9 September</span> 2024</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Journal+of+Special+Education+Technology&amp;rft.atitle=Using+Speech+Recognition+Software+to+Increase+Writing+Fluency+for+Individuals+with+Physical+Disabilities&amp;rft.volume=26&amp;rft.issue=1&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E25-%3C%2Fspan%3E41&amp;rft.date=2011&amp;rft_id=info%3Adoi%2F10.1177%2F016264341102600104&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A142730664%23id-name%3DS2CID&amp;rft.aulast=Garrett&amp;rft.aufirst=Jennifer+Tumlin&amp;rft_id=https%3A%2F%2Fscholarworks.gsu.edu%2Fepse_diss%2F46&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-130"><span class="mw-cite-backlink"><b><a href="#cite_ref-130">^</a></b></span> <span class="reference-text">Forgrave, Karen E. "Assistive Technology: Empowering Students with Disabilities." Clearing House 75.3 (2002): 122–6. Web.</span> </li> <li id="cite_note-131"><span class="mw-cite-backlink"><b><a href="#cite_ref-131">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFTangKamouaSutan2004" class="citation journal cs1">Tang, K. W.; Kamoua, Ridha; Sutan, Victor (2004). "Speech Recognition Technology for Disabilities Education". <i>Journal of Educational Technology Systems</i>. <b>33</b> (2): <span class="nowrap">173–</span>84. <a href="/wiki/CiteSeerX_(identifier)" class="mw-redirect" title="CiteSeerX (identifier)">CiteSeerX</a>&#160;<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.631.3736">10.1.1.631.3736</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.2190%2FK6K8-78K2-59Y7-R9R2">10.2190/K6K8-78K2-59Y7-R9R2</a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:143159997">143159997</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Journal+of+Educational+Technology+Systems&amp;rft.atitle=Speech+Recognition+Technology+for+Disabilities+Education&amp;rft.volume=33&amp;rft.issue=2&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E173-%3C%2Fspan%3E84&amp;rft.date=2004&amp;rft_id=https%3A%2F%2Fciteseerx.ist.psu.edu%2Fviewdoc%2Fsummary%3Fdoi%3D10.1.1.631.3736%23id-name%3DCiteSeerX&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A143159997%23id-name%3DS2CID&amp;rft_id=info%3Adoi%2F10.2190%2FK6K8-78K2-59Y7-R9R2&amp;rft.aulast=Tang&amp;rft.aufirst=K.+W.&amp;rft.au=Kamoua%2C+Ridha&amp;rft.au=Sutan%2C+Victor&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-Planetary_Society_article-132"><span class="mw-cite-backlink"><b><a href="#cite_ref-Planetary_Society_article_132-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://web.archive.org/web/20120127161038/http://www.planetary.org/programs/projects/planetary_microphones/mars_microphone.html">"Projects: Planetary Microphones"</a>. The Planetary Society. Archived from <a rel="nofollow" class="external text" href="http://www.planetary.org/programs/projects/planetary_microphones/mars_microphone.html">the original</a> on 27 January 2012.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=Projects%3A+Planetary+Microphones&amp;rft.pub=The+Planetary+Society&amp;rft_id=http%3A%2F%2Fwww.planetary.org%2Fprograms%2Fprojects%2Fplanetary_microphones%2Fmars_microphone.html&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-133"><span class="mw-cite-backlink"><b><a href="#cite_ref-133">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFCaridakisCastellanoKessousRaouzaiou2007" class="citation book cs1">Caridakis, George; Castellano, Ginevra; Kessous, Loic; Raouzaiou, Amaryllis; Malatesta, Lori; Asteriadis, Stelios; Karpouzis, Kostas (19 September 2007). "Multimodal emotion recognition from expressive faces, body gestures and speech". <i>Artificial Intelligence and Innovations 2007: From Theory to Applications</i>. IFIP the International Federation for Information Processing. Vol.&#160;247. Springer US. pp.&#160;<span class="nowrap">375–</span>388. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1007%2F978-0-387-74161-1_41">10.1007/978-0-387-74161-1_41</a>. 
<a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a>&#160;<a href="/wiki/Special:BookSources/978-0-387-74160-4" title="Special:BookSources/978-0-387-74160-4"><bdi>978-0-387-74160-4</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=bookitem&amp;rft.atitle=Multimodal+emotion+recognition+from+expressive+faces%2C+body+gestures+and+speech&amp;rft.btitle=Artificial+Intelligence+and+Innovations+2007%3A+From+Theory+to+Applications&amp;rft.series=IFIP+the+International+Federation+for+Information+Processing&amp;rft.pages=%3Cspan+class%3D%22nowrap%22%3E375-%3C%2Fspan%3E388&amp;rft.pub=Springer+US&amp;rft.date=2007-09-19&amp;rft_id=info%3Adoi%2F10.1007%2F978-0-387-74161-1_41&amp;rft.isbn=978-0-387-74160-4&amp;rft.aulast=Caridakis&amp;rft.aufirst=George&amp;rft.au=Castellano%2C+Ginevra&amp;rft.au=Kessous%2C+Loic&amp;rft.au=Raouzaiou%2C+Amaryllis&amp;rft.au=Malatesta%2C+Lori&amp;rft.au=Asteriadis%2C+Stelios&amp;rft.au=Karpouzis%2C+Kostas&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-134"><span class="mw-cite-backlink"><b><a href="#cite_ref-134">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://www.washington.edu/doit/what-real-time-captioning">"What is real-time captioning? | DO-IT"</a>. <i>www.washington.edu</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240909054510/https://www.washington.edu/doit/what-real-time-captioning">Archived</a> from the original on 9 September 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">11 April</span> 2021</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=unknown&amp;rft.jtitle=www.washington.edu&amp;rft.atitle=What+is+real-time+captioning%3F+%7C+DO-IT&amp;rft_id=https%3A%2F%2Fwww.washington.edu%2Fdoit%2Fwhat-real-time-captioning&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-135"><span class="mw-cite-backlink"><b><a href="#cite_ref-135">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFZhengLi2017" class="citation book cs1">Zheng, Thomas Fang; Li, Lantian (2017). <a rel="nofollow" class="external text" href="http://link.springer.com/10.1007/978-981-10-3238-7"><i>Robustness-Related Issues in Speaker Recognition</i></a>. SpringerBriefs in Electrical and Computer Engineering. Singapore: Springer Singapore. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1007%2F978-981-10-3238-7">10.1007/978-981-10-3238-7</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a>&#160;<a href="/wiki/Special:BookSources/978-981-10-3237-0" title="Special:BookSources/978-981-10-3237-0"><bdi>978-981-10-3237-0</bdi></a>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240909053948/https://link.springer.com/book/10.1007/978-981-10-3238-7">Archived</a> from the original on 9 September 2024<span class="reference-accessdate">. 
Retrieved <span class="nowrap">9 September</span> 2024</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=book&amp;rft.btitle=Robustness-Related+Issues+in+Speaker+Recognition&amp;rft.place=Singapore&amp;rft.series=SpringerBriefs+in+Electrical+and+Computer+Engineering&amp;rft.pub=Springer+Singapore&amp;rft.date=2017&amp;rft_id=info%3Adoi%2F10.1007%2F978-981-10-3238-7&amp;rft.isbn=978-981-10-3237-0&amp;rft.aulast=Zheng&amp;rft.aufirst=Thomas+Fang&amp;rft.au=Li%2C+Lantian&amp;rft_id=http%3A%2F%2Flink.springer.com%2F10.1007%2F978-981-10-3238-7&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-136"><span class="mw-cite-backlink"><b><a href="#cite_ref-136">^</a></b></span> <span class="reference-text">Ciaramella, Alberto. "A prototype performance evaluation report." Sundial workpackage 8000 (1993).</span> </li> <li id="cite_note-137"><span class="mw-cite-backlink"><b><a href="#cite_ref-137">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFGerbinoBaggiaCiaramellaRullent1993" class="citation book cs1">Gerbino, E.; Baggia, P.; Ciaramella, A.; Rullent, C. (1993). "Test and evaluation of a spoken dialogue system". <i>IEEE International Conference on Acoustics Speech and Signal Processing</i>. pp.&#160;135–138 vol.2. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FICASSP.1993.319250">10.1109/ICASSP.1993.319250</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a>&#160;<a href="/wiki/Special:BookSources/0-7803-0946-4" title="Special:BookSources/0-7803-0946-4"><bdi>0-7803-0946-4</bdi></a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a>&#160;<a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:57374050">57374050</a>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=bookitem&amp;rft.atitle=Test+and+evaluation+of+a+spoken+dialogue+system&amp;rft.btitle=IEEE+International+Conference+on+Acoustics+Speech+and+Signal+Processing&amp;rft.pages=135-138+vol.2&amp;rft.date=1993&amp;rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A57374050%23id-name%3DS2CID&amp;rft_id=info%3Adoi%2F10.1109%2FICASSP.1993.319250&amp;rft.isbn=0-7803-0946-4&amp;rft.aulast=Gerbino&amp;rft.aufirst=E.&amp;rft.au=Baggia%2C+P.&amp;rft.au=Ciaramella%2C+A.&amp;rft.au=Rullent%2C+C.&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-138"><span class="mw-cite-backlink"><b><a href="#cite_ref-138">^</a></b></span> <span class="reference-text">National Institute of Standards and Technology. 
"<a rel="nofollow" class="external text" href="http://www.itl.nist.gov/iad/mig/publications/ASRhistory/">The History of Automatic Speech Recognition Evaluation at NIST</a> <a rel="nofollow" class="external text" href="https://web.archive.org/web/20131008210040/http://www.itl.nist.gov/iad/mig/publications/ASRhistory/">Archived</a> 8 October 2013 at the <a href="/wiki/Wayback_Machine" title="Wayback Machine">Wayback Machine</a>".</span> </li> <li id="cite_note-139"><span class="mw-cite-backlink"><b><a href="#cite_ref-139">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://www.naeyc.org/resources/pubs/yc/mar2015/letter-sound-relationships">"Letter Names Can Cause Confusion and Other Things to Know About Letter–Sound Relationships"</a>. <i>NAEYC</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240909054452/https://www.naeyc.org/resources/pubs/yc/mar2015/letter-sound-relationships">Archived</a> from the original on 9 September 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">27 October</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=unknown&amp;rft.jtitle=NAEYC&amp;rft.atitle=Letter+Names+Can+Cause+Confusion+and+Other+Things+to+Know+About+Letter%E2%80%93Sound+Relationships&amp;rft_id=https%3A%2F%2Fwww.naeyc.org%2Fresources%2Fpubs%2Fyc%2Fmar2015%2Fletter-sound-relationships&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-140"><span class="mw-cite-backlink"><b><a href="#cite_ref-140">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation news cs1"><a rel="nofollow" class="external text" href="https://www.npr.org/2016/03/06/469383361/listen-up-your-ai-assistant-goes-crazy-for-npr-too">"Listen Up: Your AI Assistant Goes Crazy For NPR Too"</a>. <i><a href="/wiki/NPR" title="NPR">NPR</a></i>. 6 March 2016. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20170723210358/http://www.npr.org/2016/03/06/469383361/listen-up-your-ai-assistant-goes-crazy-for-npr-too">Archived</a> from the original on 23 July 2017.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=NPR&amp;rft.atitle=Listen+Up%3A+Your+AI+Assistant+Goes+Crazy+For+NPR+Too&amp;rft.date=2016-03-06&amp;rft_id=https%3A%2F%2Fwww.npr.org%2F2016%2F03%2F06%2F469383361%2Flisten-up-your-ai-assistant-goes-crazy-for-npr-too&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-141"><span class="mw-cite-backlink"><b><a href="#cite_ref-141">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFClaburn2017" class="citation news cs1">Claburn, Thomas (25 August 2017). <a rel="nofollow" class="external text" href="https://www.theregister.co.uk/2017/08/25/amazon_alexa_answers_inaudible_commands/?mt=1504024969000">"Is it possible to control Amazon Alexa, Google Now using inaudible commands? Absolutely"</a>. <i><a href="/wiki/The_Register" title="The Register">The Register</a></i>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20170902051123/https://www.theregister.co.uk/2017/08/25/amazon_alexa_answers_inaudible_commands/?mt=1504024969000">Archived</a> from the original on 2 September 2017.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=The+Register&amp;rft.atitle=Is+it+possible+to+control+Amazon+Alexa%2C+Google+Now+using+inaudible+commands%3F+Absolutely&amp;rft.date=2017-08-25&amp;rft.aulast=Claburn&amp;rft.aufirst=Thomas&amp;rft_id=https%3A%2F%2Fwww.theregister.co.uk%2F2017%2F08%2F25%2Famazon_alexa_answers_inaudible_commands%2F%3Fmt%3D1504024969000&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-142"><span class="mw-cite-backlink"><b><a href="#cite_ref-142">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://motherboard.vice.com/en_us/article/d34nnz/attack-targets-automatic-speech-recognition-systems">"Attack Targets Automatic Speech Recognition Systems"</a>. <i>vice.com</i>. 31 January 2018. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20180303050744/https://motherboard.vice.com/en_us/article/d34nnz/attack-targets-automatic-speech-recognition-systems">Archived</a> from the original on 3 March 2018<span class="reference-accessdate">. Retrieved <span class="nowrap">1 May</span> 2018</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=unknown&amp;rft.jtitle=vice.com&amp;rft.atitle=Attack+Targets+Automatic+Speech+Recognition+Systems&amp;rft.date=2018-01-31&amp;rft_id=https%3A%2F%2Fmotherboard.vice.com%2Fen_us%2Farticle%2Fd34nnz%2Fattack-targets-automatic-speech-recognition-systems&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-auto-143"><span class="mw-cite-backlink"><b><a href="#cite_ref-auto_143-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFBeigi2011" class="citation book cs1">Beigi, Homayoon (2011). <a rel="nofollow" class="external text" href="http://www.fundamentalsofspeakerrecognition.org"><i>Fundamentals of Speaker Recognition</i></a>. New York: Springer. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a>&#160;<a href="/wiki/Special:BookSources/978-0-387-77591-3" title="Special:BookSources/978-0-387-77591-3"><bdi>978-0-387-77591-3</bdi></a>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20180131140911/http://www.fundamentalsofspeakerrecognition.org/">Archived</a> from the original on 31 January 2018.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=book&amp;rft.btitle=Fundamentals+of+Speaker+Recognition&amp;rft.place=New+York&amp;rft.pub=Springer&amp;rft.date=2011&amp;rft.isbn=978-0-387-77591-3&amp;rft.aulast=Beigi&amp;rft.aufirst=Homayoon&amp;rft_id=http%3A%2F%2Fwww.fundamentalsofspeakerrecognition.org&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-144"><span class="mw-cite-backlink"><b><a href="#cite_ref-144">^</a></b></span> <span class="reference-text">Povey, D., Ghoshal, A., Boulianne, G., Burget, L., Glembek, O., Goel, N., ... &amp; Vesely, K. (2011). The Kaldi speech recognition toolkit. In IEEE 2011 workshop on automatic speech recognition and understanding (No. CONF). IEEE Signal Processing Society.</span> </li> <li id="cite_note-145"><span class="mw-cite-backlink"><b><a href="#cite_ref-145">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://web.archive.org/web/20200227020208/https://voice.mozilla.org/">"Common Voice by Mozilla"</a>. <i>voice.mozilla.org</i>. Archived from <a rel="nofollow" class="external text" href="https://voice.mozilla.org/">the original</a> on 27 February 2020<span class="reference-accessdate">. Retrieved <span class="nowrap">9 November</span> 2019</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=unknown&amp;rft.jtitle=voice.mozilla.org&amp;rft.atitle=Common+Voice+by+Mozilla&amp;rft_id=https%3A%2F%2Fvoice.mozilla.org%2F&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-146"><span class="mw-cite-backlink"><b><a href="#cite_ref-146">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://github.com/mozilla/DeepSpeech">"A TensorFlow implementation of Baidu's DeepSpeech architecture: mozilla/DeepSpeech"</a>. 9 November 2019. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240909053949/https://github.com/mozilla/DeepSpeech">Archived</a> from the original on 9 September 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">9 September</span> 2024</span> &#8211; via GitHub.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=A+TensorFlow+implementation+of+Baidu%27s+DeepSpeech+architecture%3A+mozilla%2FDeepSpeech&amp;rft.date=2019-11-09&amp;rft_id=https%3A%2F%2Fgithub.com%2Fmozilla%2FDeepSpeech&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-147"><span class="mw-cite-backlink"><b><a href="#cite_ref-147">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://github.com/tensorflow/docs">"GitHub - tensorflow/docs: TensorFlow documentation"</a>. 9 November 2019. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20240909053830/https://github.com/tensorflow/docs">Archived</a> from the original on 9 September 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">9 September</span> 2024</span> &#8211; via GitHub.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&amp;rft.genre=unknown&amp;rft.btitle=GitHub+-+tensorflow%2Fdocs%3A+TensorFlow+documentation&amp;rft.date=2019-11-09&amp;rft_id=https%3A%2F%2Fgithub.com%2Ftensorflow%2Fdocs&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-148"><span class="mw-cite-backlink"><b><a href="#cite_ref-148">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://github.com/coqui-ai">"Coqui, a startup providing open speech tech for everyone"</a>. <i>GitHub</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240909054614/https://github.com/coqui-ai">Archived</a> from the original on 9 September 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">7 March</span> 2022</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=unknown&amp;rft.jtitle=GitHub&amp;rft.atitle=Coqui%2C+a+startup+providing+open+speech+tech+for+everyone&amp;rft_id=https%3A%2F%2Fgithub.com%2Fcoqui-ai&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-149"><span class="mw-cite-backlink"><b><a href="#cite_ref-149">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFCoffey2021" class="citation magazine cs1">Coffey, Donavyn (28 April 2021). <a rel="nofollow" class="external text" href="https://www.wired.co.uk/article/maori-language-tech">"Māori are trying to save their language from Big Tech"</a>. <i>Wired UK</i>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a>&#160;<a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/1357-0978">1357-0978</a>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20240909053950/https://www.wired.com/story/maori-language-tech/">Archived</a> from the original on 9 September 2024<span class="reference-accessdate">. Retrieved <span class="nowrap">16 October</span> 2021</span>.</cite><span title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&amp;rft.genre=article&amp;rft.jtitle=Wired+UK&amp;rft.atitle=M%C4%81ori+are+trying+to+save+their+language+from+Big+Tech&amp;rft.date=2021-04-28&amp;rft.issn=1357-0978&amp;rft.aulast=Coffey&amp;rft.aufirst=Donavyn&amp;rft_id=https%3A%2F%2Fwww.wired.co.uk%2Farticle%2Fmaori-language-tech&amp;rfr_id=info%3Asid%2Fen.wikipedia.org%3ASpeech+recognition" class="Z3988"></span></span> </li> <li id="cite_note-150"><span class="mw-cite-backlink"><b><a href="#cite_ref-150">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://discourse.mozilla.org/t/why-you-should-move-from-deepspeech-to-coqui-ai/82798">"Why you should move from DeepSpeech to coqui.ai"</a>. 
Further reading

- Cole, Ronald; Mariani, Joseph; Uszkoreit, Hans; Varile, Giovanni Battista; Zaenen, Annie; Zampolli; Zue, Victor, eds. (1997). Survey of the State of the Art in Human Language Technology. Cambridge Studies in Natural Language Processing. Vols. XII–XIII. Cambridge University Press. ISBN 978-0-521-59277-2.
- Junqua, J.-C.; Haton, J.-P. (1995). Robustness in Automatic Speech Recognition: Fundamentals and Applications. Kluwer Academic Publishers. ISBN 978-0-7923-9646-8.
- Karat, Clare-Marie; Vergo, John; Nahamoo, David (2007). "Conversational Interface Technologies". In Sears, Andrew; Jacko, Julie A. (eds.). The Human-Computer Interaction Handbook: Fundamentals, Evolving Technologies, and Emerging Applications (Human Factors and Ergonomics). Lawrence Erlbaum Associates. ISBN 978-0-8058-5870-9.
- Pieraccini, Roberto (2012). The Voice in the Machine: Building Computers That Understand Speech. The MIT Press. ISBN 978-0262016858.
- Pirani, Giancarlo, ed. (2013). Advanced Algorithms and Architectures for Speech Understanding. Springer Science & Business Media. ISBN 978-3-642-84341-9.
- Signer, Beat; Hoste, Lode (December 2013). "SpeeG2: A Speech- and Gesture-based Interface for Efficient Controller-free Text Entry". Proceedings of ICMI 2013, 15th International Conference on Multimodal Interaction. Sydney, Australia. https://www.academia.edu/4685517
- Woelfel, Matthias; McDonough, John (26 May 2009). Distant Speech Recognition. Wiley. ISBN 978-0470517048.
</div> <input type="hidden" name="title" value="Special:Search"> </div> <button class="cdx-button cdx-search-input__end-button">Search</button> </form> </div> </div> </div> <div class="vector-sticky-header-context-bar"> <nav aria-label="Contents" class="vector-toc-landmark"> <div id="vector-sticky-header-toc" class="vector-dropdown mw-portlet mw-portlet-sticky-header-toc vector-sticky-header-toc vector-button-flush-left" > <input type="checkbox" id="vector-sticky-header-toc-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-sticky-header-toc" class="vector-dropdown-checkbox " aria-label="Toggle the table of contents" > <label id="vector-sticky-header-toc-label" for="vector-sticky-header-toc-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-listBullet mw-ui-icon-wikimedia-listBullet"></span> <span class="vector-dropdown-label-text">Toggle the table of contents</span> </label> <div class="vector-dropdown-content"> <div id="vector-sticky-header-toc-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <div class="vector-sticky-header-context-bar-primary" aria-hidden="true" ><span class="mw-page-title-main">Speech recognition</span></div> </div> </div> <div class="vector-sticky-header-end" aria-hidden="true"> <div class="vector-sticky-header-icons"> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" id="ca-talk-sticky-header" tabindex="-1" data-event-name="talk-sticky-header"><span class="vector-icon mw-ui-icon-speechBubbles mw-ui-icon-wikimedia-speechBubbles"></span> <span></span> </a> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" id="ca-subject-sticky-header" tabindex="-1" data-event-name="subject-sticky-header"><span class="vector-icon mw-ui-icon-article mw-ui-icon-wikimedia-article"></span> <span></span> </a> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" id="ca-history-sticky-header" tabindex="-1" data-event-name="history-sticky-header"><span class="vector-icon mw-ui-icon-wikimedia-history mw-ui-icon-wikimedia-wikimedia-history"></span> <span></span> </a> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only mw-watchlink" id="ca-watchstar-sticky-header" tabindex="-1" data-event-name="watch-sticky-header"><span class="vector-icon mw-ui-icon-wikimedia-star mw-ui-icon-wikimedia-wikimedia-star"></span> <span></span> </a> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" id="ca-edit-sticky-header" tabindex="-1" data-event-name="wikitext-edit-sticky-header"><span class="vector-icon mw-ui-icon-wikimedia-wikiText mw-ui-icon-wikimedia-wikimedia-wikiText"></span> <span></span> </a> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" id="ca-ve-edit-sticky-header" tabindex="-1" data-event-name="ve-edit-sticky-header"><span class="vector-icon mw-ui-icon-wikimedia-edit mw-ui-icon-wikimedia-wikimedia-edit"></span> <span></span> </a> <a href="#" class="cdx-button cdx-button--fake-button 
cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" id="ca-viewsource-sticky-header" tabindex="-1" data-event-name="ve-edit-protected-sticky-header"><span class="vector-icon mw-ui-icon-wikimedia-editLock mw-ui-icon-wikimedia-wikimedia-editLock"></span> <span></span> </a> </div> <div class="vector-sticky-header-buttons"> <button class="cdx-button cdx-button--weight-quiet mw-interlanguage-selector" id="p-lang-btn-sticky-header" tabindex="-1" data-event-name="ui.dropdown-p-lang-btn-sticky-header"><span class="vector-icon mw-ui-icon-wikimedia-language mw-ui-icon-wikimedia-wikimedia-language"></span> <span>49 languages</span> </button> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--action-progressive" id="ca-addsection-sticky-header" tabindex="-1" data-event-name="addsection-sticky-header"><span class="vector-icon mw-ui-icon-speechBubbleAdd-progressive mw-ui-icon-wikimedia-speechBubbleAdd-progressive"></span> <span>Add topic</span> </a> </div> <div class="vector-sticky-header-icon-end"> <div class="vector-user-links"> </div> </div> </div> </div> </div> <div class="mw-portlet mw-portlet-dock-bottom emptyPortlet" id="p-dock-bottom"> <ul> </ul> </div> <script>(RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgHostname":"mw-web.codfw.main-5c6f46dcf-fdbhq","wgBackendResponseTime":251,"wgPageParseReport":{"limitreport":{"cputime":"1.444","walltime":"1.648","ppvisitednodes":{"value":10259,"limit":1000000},"postexpandincludesize":{"value":394963,"limit":2097152},"templateargumentsize":{"value":6516,"limit":2097152},"expansiondepth":{"value":17,"limit":100},"expensivefunctioncount":{"value":15,"limit":500},"unstrip-depth":{"value":1,"limit":20},"unstrip-size":{"value":552640,"limit":5000000},"entityaccesscount":{"value":0,"limit":400},"timingprofile":["100.00% 1339.245 1 -total"," 68.82% 921.650 1 Template:Reflist"," 19.63% 262.930 46 Template:Cite_web"," 16.05% 214.895 35 Template:Cite_journal"," 11.05% 147.976 24 Template:Cite_book"," 7.21% 96.550 5 Template:Navbox"," 6.02% 80.621 13 Template:Cite_arXiv"," 5.83% 78.121 1 Template:Natural_Language_Processing"," 5.54% 74.175 1 Template:Short_description"," 5.46% 73.071 7 Template:Fix"]},"scribunto":{"limitreport-timeusage":{"value":"0.929","limit":"10.000"},"limitreport-memusage":{"value":6994257,"limit":52428800}},"cachereport":{"origin":"mw-web.codfw.main-5c6f46dcf-zkdlt","timestamp":"20250331080725","ttl":2592000,"transientcontent":false}}});});</script> <script type="application/ld+json">{"@context":"https:\/\/schema.org","@type":"Article","name":"Speech recognition","url":"https:\/\/en.wikipedia.org\/wiki\/Speech_recognition","sameAs":"http:\/\/www.wikidata.org\/entity\/Q189436","mainEntity":"http:\/\/www.wikidata.org\/entity\/Q189436","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\/\/www.wikimedia.org\/static\/images\/wmf-hor-googpub.png"}},"datePublished":"2002-01-08T05:06:41Z","dateModified":"2025-03-31T08:06:40Z","headline":"automatic conversion of spoken language into text"}</script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10