<!-- CINXE.COM | Lexical analysis - Wikipedia -->
<!DOCTYPE html> <html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available" lang="en" dir="ltr"> <head> <meta charset="UTF-8"> <title>Lexical analysis - Wikipedia</title> <!-- Inline bootstrap: rewrites the root element's class list using the enwikimwclientpreferences cookie, then assigns RLCONF. Keep every string literal in this script on a single line; a raw line break inside a quoted string is a JavaScript syntax error and stops the whole script from running. --> <script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available";var cookie=document.cookie.match(/(?:^|; )enwikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].split('%2C').forEach(function(pref){className=className.replace(new RegExp('(^| )'+pref.replace(/-clientpref-\w+$|[^\w-]+/g,'')+'-clientpref-\\w+( |$)'),'$1'+pref+'$2');});}document.documentElement.className=className;}());RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy", "wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"548c8a84-b9e8-4c56-a5d8-7c17840c69a3","wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Lexical_analysis",
"wgTitle":"Lexical analysis","wgCurRevisionId":1241869300,"wgRevisionId":1241869300,"wgArticleId":81251,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","All accuracy disputes","Articles with disputed statements from May 2010","All articles with unsourced statements","Articles with unsourced statements from April 2008","Lexical analysis","Compiler construction","Programming language implementation","Parsing"],"wgPageViewLanguage":"en","wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"Lexical_analysis", "wgRelevantArticleId":81251,"wgIsProbablyEditable":true,"wgRelevantPageIsProbablyEditable":true,"wgRestrictionEdit":[],"wgRestrictionMove":[],"wgNoticeProject":"wikipedia","wgCiteReferencePreviewsActive":false,"wgFlaggedRevsParams":{"tags":{"status":{"levels":1}}},"wgMediaViewerOnClick":true,"wgMediaViewerEnabledByDefault":true,"wgPopupsFlags":0,"wgVisualEditor":{"pageLanguageCode":"en","pageLanguageDir":"ltr","pageVariantFallbacks":"en"},"wgMFDisplayWikibaseDescriptions":{"search":true,"watchlist":true,"tagline":false,"nearby":true},"wgWMESchemaEditAttemptStepOversample":false,"wgWMEPageLength":30000,"wgRelatedArticlesCompat":[],"wgCentralAuthMobileDomain":false,"wgEditSubmitButtonLabelPublish":true,"wgULSPosition":"interlanguage","wgULSisCompactLinksEnabled":false,"wgVector2022LanguageInHeader":true,"wgULSisLanguageSelectorEmpty":false,"wgWikibaseItemId":"Q835922","wgCheckUserClientHintsHeadersJsApi":["brands","architecture","bitness","fullVersionList","mobile","model","platform", 
"platformVersion"],"GEHomepageSuggestedEditsEnableTopics":true,"wgGETopicsMatchModeEnabled":false,"wgGEStructuredTaskRejectionReasonTextInputEnabled":false,"wgGELevelingUpEnabledForUser":false};RLSTATE={"ext.globalCssJs.user.styles":"ready","site.styles":"ready","user.styles":"ready","ext.globalCssJs.user":"ready","user":"ready","user.options":"loading","ext.cite.styles":"ready","ext.pygments":"ready","skins.vector.search.codex.styles":"ready","skins.vector.styles":"ready","skins.vector.icons":"ready","jquery.makeCollapsible.styles":"ready","ext.wikimediamessages.styles":"ready","ext.visualEditor.desktopArticleTarget.noscript":"ready","ext.uls.interlanguage":"ready","wikibase.client.init":"ready","ext.wikimediaBadges":"ready"};RLPAGEMODULES=["ext.cite.ux-enhancements","ext.pygments.view","site","mediawiki.page.ready","jquery.makeCollapsible","mediawiki.toc","skins.vector.js","ext.centralNotice.geoIP","ext.centralNotice.startUp","ext.gadget.ReferenceTooltips","ext.gadget.switcher", "ext.urlShortener.toolbar","ext.centralauth.centralautologin","ext.popups","ext.visualEditor.desktopArticleTarget.init","ext.visualEditor.targetLoader","ext.echo.centralauth","ext.eventLogging","ext.wikimediaEvents","ext.navigationTiming","ext.uls.interface","ext.cx.eventlogging.campaigns","ext.cx.uls.quick.actions","wikibase.client.vector-2022","ext.checkUser.clientHints","ext.growthExperiments.SuggestedEditSession","wikibase.sidebar.tracking"];</script> <script>(RLQ=window.RLQ||[]).push(function(){mw.loader.impl(function(){return["user.options@12s5i",function($,jQuery,require,module){mw.user.tokens.set({"patrolToken":"+\\","watchToken":"+\\","csrfToken":"+\\"}); }];});});</script> <link rel="stylesheet" 
href="/w/load.php?lang=en&modules=ext.cite.styles%7Cext.pygments%2CwikimediaBadges%7Cext.uls.interlanguage%7Cext.visualEditor.desktopArticleTarget.noscript%7Cext.wikimediamessages.styles%7Cjquery.makeCollapsible.styles%7Cskins.vector.icons%2Cstyles%7Cskins.vector.search.codex.styles%7Cwikibase.client.init&only=styles&skin=vector-2022"> <script async="" src="/w/load.php?lang=en&modules=startup&only=scripts&raw=1&skin=vector-2022"></script> <meta name="ResourceLoaderDynamicStyles" content=""> <link rel="stylesheet" href="/w/load.php?lang=en&modules=site.styles&only=styles&skin=vector-2022"> <meta name="generator" content="MediaWiki 1.44.0-wmf.4"> <meta name="referrer" content="origin"> <meta name="referrer" content="origin-when-cross-origin"> <meta name="robots" content="max-image-preview:standard"> <meta name="format-detection" content="telephone=no"> <meta name="viewport" content="width=1120"> <meta property="og:title" content="Lexical analysis - Wikipedia"> <meta property="og:type" content="website"> <link rel="alternate" media="only screen and (max-width: 640px)" href="//en.m.wikipedia.org/wiki/Lexical_analysis"> <link rel="alternate" type="application/x-wiki" title="Edit this page" href="/w/index.php?title=Lexical_analysis&action=edit"> <link rel="apple-touch-icon" href="/static/apple-touch/wikipedia.png"> <link rel="icon" href="/static/favicon/wikipedia.ico"> <link rel="search" type="application/opensearchdescription+xml" href="/w/rest.php/v1/search" title="Wikipedia (en)"> <link rel="EditURI" type="application/rsd+xml" href="//en.wikipedia.org/w/api.php?action=rsd"> <link rel="canonical" href="https://en.wikipedia.org/wiki/Lexical_analysis"> <link rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/deed.en"> <link rel="alternate" type="application/atom+xml" title="Wikipedia Atom feed" href="/w/index.php?title=Special:RecentChanges&feed=atom"> <link rel="dns-prefetch" href="//meta.wikimedia.org" /> <link rel="dns-prefetch" 
href="//login.wikimedia.org"> </head> <body class="skin--responsive skin-vector skin-vector-search-vue mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject mw-editable page-Lexical_analysis rootpage-Lexical_analysis skin-vector-2022 action-view"><a class="mw-jump-link" href="#bodyContent">Jump to content</a> <div class="vector-header-container"> <header class="vector-header mw-header"> <div class="vector-header-start"> <nav class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-dropdown" class="vector-dropdown vector-main-menu-dropdown vector-button-flush-left vector-button-flush-right" > <input type="checkbox" id="vector-main-menu-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-main-menu-dropdown" class="vector-dropdown-checkbox " aria-label="Main menu" > <label id="vector-main-menu-dropdown-label" for="vector-main-menu-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-menu mw-ui-icon-wikimedia-menu"></span> <span class="vector-dropdown-label-text">Main menu</span> </label> <div class="vector-dropdown-content"> <div id="vector-main-menu-unpinned-container" class="vector-unpinned-container"> <div id="vector-main-menu" class="vector-main-menu vector-pinnable-element"> <div class="vector-pinnable-header vector-main-menu-pinnable-header vector-pinnable-header-unpinned" data-feature-name="main-menu-pinned" data-pinnable-element-id="vector-main-menu" data-pinned-container-id="vector-main-menu-pinned-container" data-unpinned-container-id="vector-main-menu-unpinned-container" > <div class="vector-pinnable-header-label">Main menu</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-main-menu.pin">move to sidebar</button> <button 
class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-main-menu.unpin">hide</button> </div> <div id="p-navigation" class="vector-menu mw-portlet mw-portlet-navigation" > <div class="vector-menu-heading"> Navigation </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="n-mainpage-description" class="mw-list-item"><a href="/wiki/Main_Page" title="Visit the main page [z]" accesskey="z"><span>Main page</span></a></li><li id="n-contents" class="mw-list-item"><a href="/wiki/Wikipedia:Contents" title="Guides to browsing Wikipedia"><span>Contents</span></a></li><li id="n-currentevents" class="mw-list-item"><a href="/wiki/Portal:Current_events" title="Articles related to current events"><span>Current events</span></a></li><li id="n-randompage" class="mw-list-item"><a href="/wiki/Special:Random" title="Visit a randomly selected article [x]" accesskey="x"><span>Random article</span></a></li><li id="n-aboutsite" class="mw-list-item"><a href="/wiki/Wikipedia:About" title="Learn about Wikipedia and how it works"><span>About Wikipedia</span></a></li><li id="n-contactpage" class="mw-list-item"><a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us" title="How to contact Wikipedia"><span>Contact us</span></a></li> </ul> </div> </div> <div id="p-interaction" class="vector-menu mw-portlet mw-portlet-interaction" > <div class="vector-menu-heading"> Contribute </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="n-help" class="mw-list-item"><a href="/wiki/Help:Contents" title="Guidance on how to use and edit Wikipedia"><span>Help</span></a></li><li id="n-introduction" class="mw-list-item"><a href="/wiki/Help:Introduction" title="Learn how to edit Wikipedia"><span>Learn to edit</span></a></li><li id="n-portal" class="mw-list-item"><a href="/wiki/Wikipedia:Community_portal" title="The hub for editors"><span>Community portal</span></a></li><li 
id="n-recentchanges" class="mw-list-item"><a href="/wiki/Special:RecentChanges" title="A list of recent changes to Wikipedia [r]" accesskey="r"><span>Recent changes</span></a></li><li id="n-upload" class="mw-list-item"><a href="/wiki/Wikipedia:File_upload_wizard" title="Add images or other media for use on Wikipedia"><span>Upload file</span></a></li> </ul> </div> </div> </div> </div> </div> </div> </nav> <a href="/wiki/Main_Page" class="mw-logo"> <img class="mw-logo-icon" src="/static/images/icons/wikipedia.png" alt="" aria-hidden="true" height="50" width="50"> <span class="mw-logo-container skin-invert"> <img class="mw-logo-wordmark" alt="Wikipedia" src="/static/images/mobile/copyright/wikipedia-wordmark-en.svg" style="width: 7.5em; height: 1.125em;"> <img class="mw-logo-tagline" alt="The Free Encyclopedia" src="/static/images/mobile/copyright/wikipedia-tagline-en.svg" width="117" height="13" style="width: 7.3125em; height: 0.8125em;"> </span> </a> </div> <div class="vector-header-end"> <div id="p-search" role="search" class="vector-search-box-vue vector-search-box-collapses vector-search-box-show-thumbnail vector-search-box-auto-expand-width vector-search-box"> <a href="/wiki/Special:Search" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only search-toggle" title="Search Wikipedia [f]" accesskey="f"><span class="vector-icon mw-ui-icon-search mw-ui-icon-wikimedia-search"></span> <span>Search</span> </a> <div class="vector-typeahead-search-container"> <div class="cdx-typeahead-search cdx-typeahead-search--show-thumbnail cdx-typeahead-search--auto-expand-width"> <form action="/w/index.php" id="searchform" class="cdx-search-input cdx-search-input--has-end-button"> <div id="simpleSearch" class="cdx-search-input__input-wrapper" data-search-loc="header-moved"> <div class="cdx-text-input cdx-text-input--has-start-icon"> <input class="cdx-text-input__input" type="search" name="search" 
placeholder="Search Wikipedia" aria-label="Search Wikipedia" autocapitalize="sentences" title="Search Wikipedia [f]" accesskey="f" id="searchInput" > <span class="cdx-text-input__icon cdx-text-input__start-icon"></span> </div> <input type="hidden" name="title" value="Special:Search"> </div> <button class="cdx-button cdx-search-input__end-button">Search</button> </form> </div> </div> </div> <nav class="vector-user-links vector-user-links-wide" aria-label="Personal tools"> <div class="vector-user-links-main"> <div id="p-vector-user-menu-preferences" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <div id="p-vector-user-menu-userpage" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <nav class="vector-appearance-landmark" aria-label="Appearance"> <div id="vector-appearance-dropdown" class="vector-dropdown " title="Change the appearance of the page's font size, width, and color" > <input type="checkbox" id="vector-appearance-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-appearance-dropdown" class="vector-dropdown-checkbox " aria-label="Appearance" > <label id="vector-appearance-dropdown-label" for="vector-appearance-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-appearance mw-ui-icon-wikimedia-appearance"></span> <span class="vector-dropdown-label-text">Appearance</span> </label> <div class="vector-dropdown-content"> <div id="vector-appearance-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <div id="p-vector-user-menu-notifications" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul 
class="vector-menu-content-list"> </ul> </div> </div> <div id="p-vector-user-menu-overflow" class="vector-menu mw-portlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-sitesupport-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en" class=""><span>Donate</span></a> </li> <li id="pt-createaccount-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="/w/index.php?title=Special:CreateAccount&returnto=Lexical+analysis" title="You are encouraged to create an account and log in; however, it is not mandatory" class=""><span>Create account</span></a> </li> <li id="pt-login-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="/w/index.php?title=Special:UserLogin&returnto=Lexical+analysis" title="You're encouraged to log in; however, it's not mandatory. 
[o]" accesskey="o" class=""><span>Log in</span></a> </li> </ul> </div> </div> </div> <div id="vector-user-links-dropdown" class="vector-dropdown vector-user-menu vector-button-flush-right vector-user-menu-logged-out" title="Log in and more options" > <input type="checkbox" id="vector-user-links-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-user-links-dropdown" class="vector-dropdown-checkbox " aria-label="Personal tools" > <label id="vector-user-links-dropdown-label" for="vector-user-links-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-ellipsis mw-ui-icon-wikimedia-ellipsis"></span> <span class="vector-dropdown-label-text">Personal tools</span> </label> <div class="vector-dropdown-content"> <div id="p-personal" class="vector-menu mw-portlet mw-portlet-personal user-links-collapsible-item" title="User menu" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-sitesupport" class="user-links-collapsible-item mw-list-item"><a href="https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en"><span>Donate</span></a></li><li id="pt-createaccount" class="user-links-collapsible-item mw-list-item"><a href="/w/index.php?title=Special:CreateAccount&returnto=Lexical+analysis" title="You are encouraged to create an account and log in; however, it is not mandatory"><span class="vector-icon mw-ui-icon-userAdd mw-ui-icon-wikimedia-userAdd"></span> <span>Create account</span></a></li><li id="pt-login" class="user-links-collapsible-item mw-list-item"><a href="/w/index.php?title=Special:UserLogin&returnto=Lexical+analysis" title="You're encouraged to log in; however, it's not mandatory. 
[o]" accesskey="o"><span class="vector-icon mw-ui-icon-logIn mw-ui-icon-wikimedia-logIn"></span> <span>Log in</span></a></li> </ul> </div> </div> <div id="p-user-menu-anon-editor" class="vector-menu mw-portlet mw-portlet-user-menu-anon-editor" > <div class="vector-menu-heading"> Pages for logged out editors <a href="/wiki/Help:Introduction" aria-label="Learn more about editing"><span>learn more</span></a> </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-anoncontribs" class="mw-list-item"><a href="/wiki/Special:MyContributions" title="A list of edits made from this IP address [y]" accesskey="y"><span>Contributions</span></a></li><li id="pt-anontalk" class="mw-list-item"><a href="/wiki/Special:MyTalk" title="Discussion about edits from this IP address [n]" accesskey="n"><span>Talk</span></a></li> </ul> </div> </div> </div> </div> </nav> </div> </header> </div> <div class="mw-page-container"> <div class="mw-page-container-inner"> <div class="vector-sitenotice-container"> <div id="siteNotice"><!-- CentralNotice --></div> </div> <div class="vector-column-start"> <div class="vector-main-menu-container"> <div id="mw-navigation"> <nav id="mw-panel" class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-pinned-container" class="vector-pinned-container"> </div> </nav> </div> </div> <div class="vector-sticky-pinned-container"> <nav id="mw-panel-toc" aria-label="Contents" data-event-name="ui.sidebar-toc" class="mw-table-of-contents-container vector-toc-landmark"> <div id="vector-toc-pinned-container" class="vector-pinned-container"> <div id="vector-toc" class="vector-toc vector-pinnable-element"> <div class="vector-pinnable-header vector-toc-pinnable-header vector-pinnable-header-pinned" data-feature-name="toc-pinned" data-pinnable-element-id="vector-toc" > <h2 class="vector-pinnable-header-label">Contents</h2> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" 
data-event-name="pinnable-header.vector-toc.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-toc.unpin">hide</button> </div> <ul class="vector-toc-contents" id="mw-panel-toc-list"> <li id="toc-mw-content-text" class="vector-toc-list-item vector-toc-level-1"> <a href="#" class="vector-toc-link"> <div class="vector-toc-text">(Top)</div> </a> </li> <li id="toc-Rule-based_programs" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Rule-based_programs"> <div class="vector-toc-text"> <span class="vector-toc-numb">1</span> <span>Rule-based programs</span> </div> </a> <ul id="toc-Rule-based_programs-sublist" class="vector-toc-list"> </ul> </li> <!-- The quotation marks in this entry's id and href values are written as &quot; so they stay inside the double-quoted attribute values instead of ending them early. --> <li id="toc-Disambiguation_of_&quot;lexeme&quot;" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Disambiguation_of_&quot;lexeme&quot;"> <div class="vector-toc-text"> <span class="vector-toc-numb">2</span> <span>Disambiguation of "lexeme"</span> </div> </a> <ul id="toc-Disambiguation_of_&quot;lexeme&quot;-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Lexical_token_and_lexical_tokenization" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Lexical_token_and_lexical_tokenization"> <div class="vector-toc-text"> <span class="vector-toc-numb">3</span> <span>Lexical token and lexical tokenization</span> </div> </a> <ul id="toc-Lexical_token_and_lexical_tokenization-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Lexical_grammar" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Lexical_grammar"> <div class="vector-toc-text"> <span class="vector-toc-numb">4</span> <span>Lexical grammar</span> </div> </a> <ul id="toc-Lexical_grammar-sublist" class="vector-toc-list"> </ul> </li> <li 
id="toc-Details" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Details"> <div class="vector-toc-text"> <span class="vector-toc-numb">5</span> <span>Details</span> </div> </a> <button aria-controls="toc-Details-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Details subsection</span> </button> <ul id="toc-Details-sublist" class="vector-toc-list"> <li id="toc-Scanner" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Scanner"> <div class="vector-toc-text"> <span class="vector-toc-numb">5.1</span> <span>Scanner</span> </div> </a> <ul id="toc-Scanner-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Evaluator" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Evaluator"> <div class="vector-toc-text"> <span class="vector-toc-numb">5.2</span> <span>Evaluator</span> </div> </a> <ul id="toc-Evaluator-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Obstacles" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Obstacles"> <div class="vector-toc-text"> <span class="vector-toc-numb">6</span> <span>Obstacles</span> </div> </a> <ul id="toc-Obstacles-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Lexer_generator" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Lexer_generator"> <div class="vector-toc-text"> <span class="vector-toc-numb">7</span> <span>Lexer generator</span> </div> </a> <ul id="toc-Lexer_generator-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Phrase_structure" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Phrase_structure"> <div class="vector-toc-text"> <span 
class="vector-toc-numb">8</span> <span>Phrase structure</span> </div> </a> <button aria-controls="toc-Phrase_structure-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Phrase structure subsection</span> </button> <ul id="toc-Phrase_structure-sublist" class="vector-toc-list"> <li id="toc-Line_continuation" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Line_continuation"> <div class="vector-toc-text"> <span class="vector-toc-numb">8.1</span> <span>Line continuation</span> </div> </a> <ul id="toc-Line_continuation-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Semicolon_insertion" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Semicolon_insertion"> <div class="vector-toc-text"> <span class="vector-toc-numb">8.2</span> <span>Semicolon insertion</span> </div> </a> <ul id="toc-Semicolon_insertion-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Off-side_rule" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Off-side_rule"> <div class="vector-toc-text"> <span class="vector-toc-numb">8.3</span> <span>Off-side rule</span> </div> </a> <ul id="toc-Off-side_rule-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Context-sensitive_lexing" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Context-sensitive_lexing"> <div class="vector-toc-text"> <span class="vector-toc-numb">9</span> <span>Context-sensitive lexing</span> </div> </a> <ul id="toc-Context-sensitive_lexing-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-See_also" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#See_also"> <div class="vector-toc-text"> <span class="vector-toc-numb">10</span> <span>See also</span> </div> 
</a> <ul id="toc-See_also-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-References" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#References"> <div class="vector-toc-text"> <span class="vector-toc-numb">11</span> <span>References</span> </div> </a> <button aria-controls="toc-References-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle References subsection</span> </button> <ul id="toc-References-sublist" class="vector-toc-list"> <li id="toc-Sources" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Sources"> <div class="vector-toc-text"> <span class="vector-toc-numb">11.1</span> <span>Sources</span> </div> </a> <ul id="toc-Sources-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-External_links" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#External_links"> <div class="vector-toc-text"> <span class="vector-toc-numb">12</span> <span>External links</span> </div> </a> <ul id="toc-External_links-sublist" class="vector-toc-list"> </ul> </li> </ul> </div> </div> </nav> </div> </div> <div class="mw-content-container"> <main id="content" class="mw-body"> <header class="mw-body-header vector-page-titlebar"> <nav aria-label="Contents" class="vector-toc-landmark"> <div id="vector-page-titlebar-toc" class="vector-dropdown vector-page-titlebar-toc vector-button-flush-left" > <input type="checkbox" id="vector-page-titlebar-toc-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-titlebar-toc" class="vector-dropdown-checkbox " aria-label="Toggle the table of contents" > <label id="vector-page-titlebar-toc-label" for="vector-page-titlebar-toc-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button 
cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-listBullet mw-ui-icon-wikimedia-listBullet"></span> <span class="vector-dropdown-label-text">Toggle the table of contents</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-titlebar-toc-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <h1 id="firstHeading" class="firstHeading mw-first-heading"><span class="mw-page-title-main">Lexical analysis</span></h1> <div id="p-lang-btn" class="vector-dropdown mw-portlet mw-portlet-lang" > <input type="checkbox" id="p-lang-btn-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-p-lang-btn" class="vector-dropdown-checkbox mw-interlanguage-selector" aria-label="Go to an article in another language. Available in 31 languages" > <label id="p-lang-btn-label" for="p-lang-btn-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--action-progressive mw-portlet-lang-heading-31" aria-hidden="true" ><span class="vector-icon mw-ui-icon-language-progressive mw-ui-icon-wikimedia-language-progressive"></span> <span class="vector-dropdown-label-text">31 languages</span> </label> <div class="vector-dropdown-content"> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li class="interlanguage-link interwiki-ar mw-list-item"><a href="https://ar.wikipedia.org/wiki/%D8%AA%D8%AD%D9%84%D9%8A%D9%84_%D8%A7%D9%84%D9%85%D9%81%D8%B1%D8%AF%D8%A7%D8%AA" title="تحليل المفردات – Arabic" lang="ar" hreflang="ar" data-title="تحليل المفردات" data-language-autonym="العربية" data-language-local-name="Arabic" class="interlanguage-link-target"><span>العربية</span></a></li><li class="interlanguage-link interwiki-ca mw-list-item"><a href="https://ca.wikipedia.org/wiki/An%C3%A0lisi_l%C3%A8xica" title="Anàlisi lèxica – Catalan" lang="ca" 
hreflang="ca" data-title="Anàlisi lèxica" data-language-autonym="Català" data-language-local-name="Catalan" class="interlanguage-link-target"><span>Català</span></a></li><li class="interlanguage-link interwiki-cs mw-list-item"><a href="https://cs.wikipedia.org/wiki/Lexik%C3%A1ln%C3%AD_anal%C3%BDza" title="Lexikální analýza – Czech" lang="cs" hreflang="cs" data-title="Lexikální analýza" data-language-autonym="Čeština" data-language-local-name="Czech" class="interlanguage-link-target"><span>Čeština</span></a></li><li class="interlanguage-link interwiki-da mw-list-item"><a href="https://da.wikipedia.org/wiki/Leksikalsk_analyse" title="Leksikalsk analyse – Danish" lang="da" hreflang="da" data-title="Leksikalsk analyse" data-language-autonym="Dansk" data-language-local-name="Danish" class="interlanguage-link-target"><span>Dansk</span></a></li><li class="interlanguage-link interwiki-de mw-list-item"><a href="https://de.wikipedia.org/wiki/Lexikalische_Analyse" title="Lexikalische Analyse – German" lang="de" hreflang="de" data-title="Lexikalische Analyse" data-language-autonym="Deutsch" data-language-local-name="German" class="interlanguage-link-target"><span>Deutsch</span></a></li><li class="interlanguage-link interwiki-el mw-list-item"><a href="https://el.wikipedia.org/wiki/%CE%9B%CE%B5%CE%BA%CF%84%CE%B9%CE%BA%CE%AE_%CE%B1%CE%BD%CE%AC%CE%BB%CF%85%CF%83%CE%B7" title="Λεκτική ανάλυση – Greek" lang="el" hreflang="el" data-title="Λεκτική ανάλυση" data-language-autonym="Ελληνικά" data-language-local-name="Greek" class="interlanguage-link-target"><span>Ελληνικά</span></a></li><li class="interlanguage-link interwiki-es mw-list-item"><a href="https://es.wikipedia.org/wiki/Analizador_l%C3%A9xico" title="Analizador léxico – Spanish" lang="es" hreflang="es" data-title="Analizador léxico" data-language-autonym="Español" data-language-local-name="Spanish" class="interlanguage-link-target"><span>Español</span></a></li><li class="interlanguage-link interwiki-fa mw-list-item"><a 
href="https://fa.wikipedia.org/wiki/%D8%AA%D8%AD%D9%84%DB%8C%D9%84_%D9%88%D8%A7%DA%98%DA%AF%D8%A7%D9%86%DB%8C" title="تحلیل واژگانی – Persian" lang="fa" hreflang="fa" data-title="تحلیل واژگانی" data-language-autonym="فارسی" data-language-local-name="Persian" class="interlanguage-link-target"><span>فارسی</span></a></li><li class="interlanguage-link interwiki-fr mw-list-item"><a href="https://fr.wikipedia.org/wiki/Analyse_lexicale" title="Analyse lexicale – French" lang="fr" hreflang="fr" data-title="Analyse lexicale" data-language-autonym="Français" data-language-local-name="French" class="interlanguage-link-target"><span>Français</span></a></li><li class="interlanguage-link interwiki-ko mw-list-item"><a href="https://ko.wikipedia.org/wiki/%EB%82%B1%EB%A7%90_%EB%B6%84%EC%84%9D" title="낱말 분석 – Korean" lang="ko" hreflang="ko" data-title="낱말 분석" data-language-autonym="한국어" data-language-local-name="Korean" class="interlanguage-link-target"><span>한국어</span></a></li><li class="interlanguage-link interwiki-hy mw-list-item"><a href="https://hy.wikipedia.org/wiki/%D4%BC%D5%A5%D6%84%D5%BD%D5%AB%D5%AF%D5%A1%D5%AF%D5%A1%D5%B6_%D5%BE%D5%A5%D6%80%D5%AC%D5%B8%D6%82%D5%AE%D5%B8%D6%82%D5%A9%D5%B5%D5%B8%D6%82%D5%B6" title="Լեքսիկական վերլուծություն – Armenian" lang="hy" hreflang="hy" data-title="Լեքսիկական վերլուծություն" data-language-autonym="Հայերեն" data-language-local-name="Armenian" class="interlanguage-link-target"><span>Հայերեն</span></a></li><li class="interlanguage-link interwiki-hr mw-list-item"><a href="https://hr.wikipedia.org/wiki/Leksi%C4%8Dka_analiza" title="Leksička analiza – Croatian" lang="hr" hreflang="hr" data-title="Leksička analiza" data-language-autonym="Hrvatski" data-language-local-name="Croatian" class="interlanguage-link-target"><span>Hrvatski</span></a></li><li class="interlanguage-link interwiki-id mw-list-item"><a href="https://id.wikipedia.org/wiki/Analisis_leksikal" title="Analisis leksikal – Indonesian" lang="id" hreflang="id" data-title="Analisis 
leksikal" data-language-autonym="Bahasa Indonesia" data-language-local-name="Indonesian" class="interlanguage-link-target"><span>Bahasa Indonesia</span></a></li><li class="interlanguage-link interwiki-it mw-list-item"><a href="https://it.wikipedia.org/wiki/Analisi_lessicale" title="Analisi lessicale – Italian" lang="it" hreflang="it" data-title="Analisi lessicale" data-language-autonym="Italiano" data-language-local-name="Italian" class="interlanguage-link-target"><span>Italiano</span></a></li><li class="interlanguage-link interwiki-he mw-list-item"><a href="https://he.wikipedia.org/wiki/%D7%A0%D7%99%D7%AA%D7%95%D7%97_%D7%9E%D7%99%D7%9C%D7%95%D7%9C%D7%99" title="ניתוח מילולי – Hebrew" lang="he" hreflang="he" data-title="ניתוח מילולי" data-language-autonym="עברית" data-language-local-name="Hebrew" class="interlanguage-link-target"><span>עברית</span></a></li><li class="interlanguage-link interwiki-mk mw-list-item"><a href="https://mk.wikipedia.org/wiki/%D0%9B%D0%B5%D0%BA%D1%81%D0%B8%D1%87%D0%BA%D0%B8_%D0%B0%D0%BD%D0%B0%D0%BB%D0%B8%D0%B7%D0%B0%D1%82%D0%BE%D1%80" title="Лексички анализатор – Macedonian" lang="mk" hreflang="mk" data-title="Лексички анализатор" data-language-autonym="Македонски" data-language-local-name="Macedonian" class="interlanguage-link-target"><span>Македонски</span></a></li><li class="interlanguage-link interwiki-ms mw-list-item"><a href="https://ms.wikipedia.org/wiki/Analisis_leksikal" title="Analisis leksikal – Malay" lang="ms" hreflang="ms" data-title="Analisis leksikal" data-language-autonym="Bahasa Melayu" data-language-local-name="Malay" class="interlanguage-link-target"><span>Bahasa Melayu</span></a></li><li class="interlanguage-link interwiki-nl mw-list-item"><a href="https://nl.wikipedia.org/wiki/Lexicale_analyse" title="Lexicale analyse – Dutch" lang="nl" hreflang="nl" data-title="Lexicale analyse" data-language-autonym="Nederlands" data-language-local-name="Dutch" class="interlanguage-link-target"><span>Nederlands</span></a></li><li 
class="interlanguage-link interwiki-ja mw-list-item"><a href="https://ja.wikipedia.org/wiki/%E5%AD%97%E5%8F%A5%E8%A7%A3%E6%9E%90" title="字句解析 – Japanese" lang="ja" hreflang="ja" data-title="字句解析" data-language-autonym="日本語" data-language-local-name="Japanese" class="interlanguage-link-target"><span>日本語</span></a></li><li class="interlanguage-link interwiki-no mw-list-item"><a href="https://no.wikipedia.org/wiki/Leksikalsk_analyse" title="Leksikalsk analyse – Norwegian Bokmål" lang="nb" hreflang="nb" data-title="Leksikalsk analyse" data-language-autonym="Norsk bokmål" data-language-local-name="Norwegian Bokmål" class="interlanguage-link-target"><span>Norsk bokmål</span></a></li><li class="interlanguage-link interwiki-pl mw-list-item"><a href="https://pl.wikipedia.org/wiki/Analiza_leksykalna" title="Analiza leksykalna – Polish" lang="pl" hreflang="pl" data-title="Analiza leksykalna" data-language-autonym="Polski" data-language-local-name="Polish" class="interlanguage-link-target"><span>Polski</span></a></li><li class="interlanguage-link interwiki-pt mw-list-item"><a href="https://pt.wikipedia.org/wiki/An%C3%A1lise_l%C3%A9xica" title="Análise léxica – Portuguese" lang="pt" hreflang="pt" data-title="Análise léxica" data-language-autonym="Português" data-language-local-name="Portuguese" class="interlanguage-link-target"><span>Português</span></a></li><li class="interlanguage-link interwiki-ru mw-list-item"><a href="https://ru.wikipedia.org/wiki/%D0%9B%D0%B5%D0%BA%D1%81%D0%B8%D1%87%D0%B5%D1%81%D0%BA%D0%B8%D0%B9_%D0%B0%D0%BD%D0%B0%D0%BB%D0%B8%D0%B7" title="Лексический анализ – Russian" lang="ru" hreflang="ru" data-title="Лексический анализ" data-language-autonym="Русский" data-language-local-name="Russian" class="interlanguage-link-target"><span>Русский</span></a></li><li class="interlanguage-link interwiki-sr mw-list-item"><a href="https://sr.wikipedia.org/wiki/Leksi%C4%8Dka_analiza" title="Leksička analiza – Serbian" lang="sr" hreflang="sr" data-title="Leksička analiza" 
data-language-autonym="Српски / srpski" data-language-local-name="Serbian" class="interlanguage-link-target"><span>Српски / srpski</span></a></li><li class="interlanguage-link interwiki-sv mw-list-item"><a href="https://sv.wikipedia.org/wiki/Lexikalanalys" title="Lexikalanalys – Swedish" lang="sv" hreflang="sv" data-title="Lexikalanalys" data-language-autonym="Svenska" data-language-local-name="Swedish" class="interlanguage-link-target"><span>Svenska</span></a></li><li class="interlanguage-link interwiki-ta mw-list-item"><a href="https://ta.wikipedia.org/wiki/%E0%AE%8E%E0%AE%B4%E0%AF%81%E0%AE%A4%E0%AF%8D%E0%AE%A4%E0%AF%81_%E0%AE%AA%E0%AE%BE%E0%AE%95%E0%AF%81%E0%AE%AA%E0%AE%9F%E0%AF%81%E0%AE%A4%E0%AF%8D%E0%AE%A4%E0%AE%BF" title="எழுத்து பாகுபடுத்தி – Tamil" lang="ta" hreflang="ta" data-title="எழுத்து பாகுபடுத்தி" data-language-autonym="தமிழ்" data-language-local-name="Tamil" class="interlanguage-link-target"><span>தமிழ்</span></a></li><li class="interlanguage-link interwiki-tr mw-list-item"><a href="https://tr.wikipedia.org/wiki/S%C3%B6zc%C3%BCksel_analiz" title="Sözcüksel analiz – Turkish" lang="tr" hreflang="tr" data-title="Sözcüksel analiz" data-language-autonym="Türkçe" data-language-local-name="Turkish" class="interlanguage-link-target"><span>Türkçe</span></a></li><li class="interlanguage-link interwiki-uk mw-list-item"><a href="https://uk.wikipedia.org/wiki/%D0%9B%D0%B5%D0%BA%D1%81%D0%B8%D1%87%D0%BD%D0%B8%D0%B9_%D0%B0%D0%BD%D0%B0%D0%BB%D1%96%D0%B7" title="Лексичний аналіз – Ukrainian" lang="uk" hreflang="uk" data-title="Лексичний аналіз" data-language-autonym="Українська" data-language-local-name="Ukrainian" class="interlanguage-link-target"><span>Українська</span></a></li><li class="interlanguage-link interwiki-vi mw-list-item"><a href="https://vi.wikipedia.org/wiki/Ph%C3%A2n_t%C3%ADch_t%E1%BB%AB_v%E1%BB%B1ng" title="Phân tích từ vựng – Vietnamese" lang="vi" hreflang="vi" data-title="Phân tích từ vựng" data-language-autonym="Tiếng Việt" 
data-language-local-name="Vietnamese" class="interlanguage-link-target"><span>Tiếng Việt</span></a></li><li class="interlanguage-link interwiki-zh-yue mw-list-item"><a href="https://zh-yue.wikipedia.org/wiki/%E8%A9%9E%E6%B3%95%E5%88%86%E6%9E%90" title="詞法分析 – Cantonese" lang="yue" hreflang="yue" data-title="詞法分析" data-language-autonym="粵語" data-language-local-name="Cantonese" class="interlanguage-link-target"><span>粵語</span></a></li><li class="interlanguage-link interwiki-zh mw-list-item"><a href="https://zh.wikipedia.org/wiki/%E8%AF%8D%E6%B3%95%E5%88%86%E6%9E%90" title="词法分析 – Chinese" lang="zh" hreflang="zh" data-title="词法分析" data-language-autonym="中文" data-language-local-name="Chinese" class="interlanguage-link-target"><span>中文</span></a></li> </ul> <div class="after-portlet after-portlet-lang"><span class="wb-langlinks-edit wb-langlinks-link"><a href="https://www.wikidata.org/wiki/Special:EntityPage/Q835922#sitelinks-wikipedia" title="Edit interlanguage links" class="wbc-editpage">Edit links</a></span></div> </div> </div> </div> </header> <div class="vector-page-toolbar"> <div class="vector-page-toolbar-container"> <div id="left-navigation"> <nav aria-label="Namespaces"> <div id="p-associated-pages" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-associated-pages" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-nstab-main" class="selected vector-tab-noicon mw-list-item"><a href="/wiki/Lexical_analysis" title="View the content page [c]" accesskey="c"><span>Article</span></a></li><li id="ca-talk" class="vector-tab-noicon mw-list-item"><a href="/wiki/Talk:Lexical_analysis" rel="discussion" title="Discuss improvements to the content page [t]" accesskey="t"><span>Talk</span></a></li> </ul> </div> </div> <div id="vector-variants-dropdown" class="vector-dropdown emptyPortlet" > <input type="checkbox" id="vector-variants-dropdown-checkbox" role="button" aria-haspopup="true" 
data-event-name="ui.dropdown-vector-variants-dropdown" class="vector-dropdown-checkbox " aria-label="Change language variant" > <label id="vector-variants-dropdown-label" for="vector-variants-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" aria-hidden="true" ><span class="vector-dropdown-label-text">English</span> </label> <div class="vector-dropdown-content"> <div id="p-variants" class="vector-menu mw-portlet mw-portlet-variants emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> </div> </div> </nav> </div> <div id="right-navigation" class="vector-collapsible"> <nav aria-label="Views"> <div id="p-views" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-views" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-view" class="selected vector-tab-noicon mw-list-item"><a href="/wiki/Lexical_analysis"><span>Read</span></a></li><li id="ca-edit" class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Lexical_analysis&action=edit" title="Edit this page [e]" accesskey="e"><span>Edit</span></a></li><li id="ca-history" class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Lexical_analysis&action=history" title="Past revisions of this page [h]" accesskey="h"><span>View history</span></a></li> </ul> </div> </div> </nav> <nav class="vector-page-tools-landmark" aria-label="Page tools"> <div id="vector-page-tools-dropdown" class="vector-dropdown vector-page-tools-dropdown" > <input type="checkbox" id="vector-page-tools-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-tools-dropdown" class="vector-dropdown-checkbox " aria-label="Tools" > <label id="vector-page-tools-dropdown-label" for="vector-page-tools-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled 
cdx-button--weight-quiet" aria-hidden="true" ><span class="vector-dropdown-label-text">Tools</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-tools-unpinned-container" class="vector-unpinned-container"> <div id="vector-page-tools" class="vector-page-tools vector-pinnable-element"> <div class="vector-pinnable-header vector-page-tools-pinnable-header vector-pinnable-header-unpinned" data-feature-name="page-tools-pinned" data-pinnable-element-id="vector-page-tools" data-pinned-container-id="vector-page-tools-pinned-container" data-unpinned-container-id="vector-page-tools-unpinned-container" > <div class="vector-pinnable-header-label">Tools</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-page-tools.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-page-tools.unpin">hide</button> </div> <div id="p-cactions" class="vector-menu mw-portlet mw-portlet-cactions emptyPortlet vector-has-collapsible-items" title="More options" > <div class="vector-menu-heading"> Actions </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-more-view" class="selected vector-more-collapsible-item mw-list-item"><a href="/wiki/Lexical_analysis"><span>Read</span></a></li><li id="ca-more-edit" class="vector-more-collapsible-item mw-list-item"><a href="/w/index.php?title=Lexical_analysis&action=edit" title="Edit this page [e]" accesskey="e"><span>Edit</span></a></li><li id="ca-more-history" class="vector-more-collapsible-item mw-list-item"><a href="/w/index.php?title=Lexical_analysis&action=history"><span>View history</span></a></li> </ul> </div> </div> <div id="p-tb" class="vector-menu mw-portlet mw-portlet-tb" > <div class="vector-menu-heading"> General </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li 
id="t-whatlinkshere" class="mw-list-item"><a href="/wiki/Special:WhatLinksHere/Lexical_analysis" title="List of all English Wikipedia pages containing links to this page [j]" accesskey="j"><span>What links here</span></a></li><li id="t-recentchangeslinked" class="mw-list-item"><a href="/wiki/Special:RecentChangesLinked/Lexical_analysis" rel="nofollow" title="Recent changes in pages linked from this page [k]" accesskey="k"><span>Related changes</span></a></li><li id="t-upload" class="mw-list-item"><a href="/wiki/Wikipedia:File_Upload_Wizard" title="Upload files [u]" accesskey="u"><span>Upload file</span></a></li><li id="t-specialpages" class="mw-list-item"><a href="/wiki/Special:SpecialPages" title="A list of all special pages [q]" accesskey="q"><span>Special pages</span></a></li><li id="t-permalink" class="mw-list-item"><a href="/w/index.php?title=Lexical_analysis&oldid=1241869300" title="Permanent link to this revision of this page"><span>Permanent link</span></a></li><li id="t-info" class="mw-list-item"><a href="/w/index.php?title=Lexical_analysis&action=info" title="More information about this page"><span>Page information</span></a></li><li id="t-cite" class="mw-list-item"><a href="/w/index.php?title=Special:CiteThisPage&page=Lexical_analysis&id=1241869300&wpFormIdentifier=titleform" title="Information on how to cite this page"><span>Cite this page</span></a></li><li id="t-urlshortener" class="mw-list-item"><a href="/w/index.php?title=Special:UrlShortener&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FLexical_analysis"><span>Get shortened URL</span></a></li><li id="t-urlshortener-qrcode" class="mw-list-item"><a href="/w/index.php?title=Special:QrCode&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FLexical_analysis"><span>Download QR code</span></a></li> </ul> </div> </div> <div id="p-coll-print_export" class="vector-menu mw-portlet mw-portlet-coll-print_export" > <div class="vector-menu-heading"> Print/export </div> <div class="vector-menu-content"> <ul 
class="vector-menu-content-list"> <li id="coll-download-as-rl" class="mw-list-item"><a href="/w/index.php?title=Special:DownloadAsPdf&page=Lexical_analysis&action=show-download-screen" title="Download this page as a PDF file"><span>Download as PDF</span></a></li><li id="t-print" class="mw-list-item"><a href="/w/index.php?title=Lexical_analysis&printable=yes" title="Printable version of this page [p]" accesskey="p"><span>Printable version</span></a></li> </ul> </div> </div> <div id="p-wikibase-otherprojects" class="vector-menu mw-portlet mw-portlet-wikibase-otherprojects" > <div class="vector-menu-heading"> In other projects </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="t-wikibase" class="wb-otherproject-link wb-otherproject-wikibase-dataitem mw-list-item"><a href="https://www.wikidata.org/wiki/Special:EntityPage/Q835922" title="Structured data on this page hosted by Wikidata [g]" accesskey="g"><span>Wikidata item</span></a></li> </ul> </div> </div> </div> </div> </div> </div> </nav> </div> </div> </div> <div class="vector-column-end"> <div class="vector-sticky-pinned-container"> <nav class="vector-page-tools-landmark" aria-label="Page tools"> <div id="vector-page-tools-pinned-container" class="vector-pinned-container"> </div> </nav> <nav class="vector-appearance-landmark" aria-label="Appearance"> <div id="vector-appearance-pinned-container" class="vector-pinned-container"> <div id="vector-appearance" class="vector-appearance vector-pinnable-element"> <div class="vector-pinnable-header vector-appearance-pinnable-header vector-pinnable-header-pinned" data-feature-name="appearance-pinned" data-pinnable-element-id="vector-appearance" data-pinned-container-id="vector-appearance-pinned-container" data-unpinned-container-id="vector-appearance-unpinned-container" > <div class="vector-pinnable-header-label">Appearance</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" 
data-event-name="pinnable-header.vector-appearance.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-appearance.unpin">hide</button> </div> </div> </div> </nav> </div> </div> <div id="bodyContent" class="vector-body" aria-labelledby="firstHeading" data-mw-ve-target-container> <div class="vector-body-before-content"> <div class="mw-indicators"> </div> <div id="siteSub" class="noprint">From Wikipedia, the free encyclopedia</div> </div> <div id="contentSub"><div id="mw-content-subtitle"></div></div> <div id="mw-content-text" class="mw-body-content"><div class="mw-content-ltr mw-parser-output" lang="en" dir="ltr"><style data-mw-deduplicate="TemplateStyles:r1236090951">.mw-parser-output .hatnote{font-style:italic}.mw-parser-output div.hatnote{padding-left:1.6em;margin-bottom:0.5em}.mw-parser-output .hatnote i{font-style:normal}.mw-parser-output .hatnote+link+.hatnote{margin-top:-0.5em}@media print{body.ns-0 .mw-parser-output .hatnote{display:none!important}}</style><div role="note" class="hatnote navigation-not-searchable">"Lexer" redirects here. For people with this name, see <a href="/wiki/Lexer_(surname)" title="Lexer (surname)">Lexer (surname)</a>.</div> <div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">Conversion of character sequences into token sequences in computer science</div> <p><b>Lexical tokenization</b> is conversion of a text into (semantically or syntactically) meaningful <i>lexical tokens</i> belonging to categories defined by a "lexer" program. In case of a natural language, those categories include nouns, verbs, adjectives, punctuations etc. In case of a programming language, the categories include identifiers, operators, grouping symbols and <a href="/wiki/Data_type" title="Data type">data types</a>. 
Lexical tokenization is related to the type of tokenization used in <a href="/wiki/Large_language_model" title="Large language model">large language models</a> (LLMs) but with two differences. First, lexical tokenization is usually based on a <a href="/wiki/Lexical_grammar" title="Lexical grammar">lexical grammar</a>, whereas LLM tokenizers are usually <a href="/wiki/Probability" title="Probability">probability</a>-based. Second, LLM tokenizers perform a second step that converts the tokens into numerical values. </p> <meta property="mw:PageProp/toc" /> <div class="mw-heading mw-heading2"><h2 id="Rule-based_programs">Rule-based programs</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Lexical_analysis&action=edit&section=1" title="Edit section: Rule-based programs"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>A rule-based program, performing lexical tokenization, is called a <i>tokenizer</i>,<sup id="cite_ref-1" class="reference"><a href="#cite_note-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup> or <i>scanner</i>, although <i>scanner</i> is also a term for the first stage of a lexer. A lexer forms the first phase of a <a href="/wiki/Compiler_frontend" class="mw-redirect" title="Compiler frontend">compiler frontend</a> in processing. Analysis generally occurs in one pass. Lexers and parsers are most often used for compilers, but can be used for other computer language tools, such as <a href="/wiki/Prettyprint" title="Prettyprint">prettyprinters</a> or <a href="/wiki/Lint_(software)" title="Lint (software)">linters</a>. Lexing can be divided into two stages: the <i>scanning</i>, which segments the input string into syntactic units called <i>lexemes</i> and categorizes these into token classes, and the <i>evaluating</i>, which converts lexemes into processed values. 
</p><p>Lexers are generally quite simple, with most of the complexity deferred to the <a href="/wiki/Parsing" title="Parsing">syntactic analysis</a> or <a href="/wiki/Semantic_analysis_(compilers)" title="Semantic analysis (compilers)">semantic analysis</a> phases, and can often be generated by a <a href="#Lexer_generator">lexer generator</a>, notably <a href="/wiki/Lex_(software)" title="Lex (software)">lex</a> or derivatives. However, lexers can sometimes include some complexity, such as <a href="#Phrase_structure">phrase structure</a> processing to make input easier and simplify the parser, and may be written partly or fully by hand, either to support more features or for performance. </p> <div class="mw-heading mw-heading2"><h2 id="Disambiguation_of_&quot;lexeme&quot;"><span id="Disambiguation_of_.22lexeme.22"></span>Disambiguation of "lexeme"</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Lexical_analysis&action=edit&section=2" title="Edit section: Disambiguation of &quot;lexeme&quot;"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">See also: <a href="/wiki/Word_boundary_(linguistics)" class="mw-redirect" title="Word boundary (linguistics)">Word boundary (linguistics)</a> and <a href="/wiki/Word_boundary_(computing)" class="mw-redirect" title="Word boundary (computing)">Word boundary (computing)</a></div> <p>What is called "lexeme" in rule-based <a href="/wiki/Natural_language_processing" title="Natural language processing">natural language processing</a> is not equal to what is called <a href="/wiki/Lexeme" title="Lexeme">lexeme</a> in linguistics. 
What is called "lexeme" in rule-based natural language processing can be equal to the linguistic equivalent only in <a href="/wiki/Analytic_language" title="Analytic language">analytic languages</a>, such as English, but not in highly <a href="/wiki/Synthetic_language" title="Synthetic language">synthetic languages</a>, such as <a href="/wiki/Fusional_language" title="Fusional language">fusional languages</a>. What is called a lexeme in rule-based natural language processing is more similar to what is called a <a href="/wiki/Word" title="Word">word</a> in linguistics (not to be confused with a <a href="/wiki/Word_(computer_architecture)" title="Word (computer architecture)">word in computer architecture</a>), although in some cases it may be more similar to a <a href="/wiki/Morpheme" title="Morpheme">morpheme</a>. </p> <div class="mw-heading mw-heading2"><h2 id="Lexical_token_and_lexical_tokenization"><span class="anchor" id="Tokenization"></span><span class="anchor" id="Token"></span>Lexical token and lexical tokenization</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Lexical_analysis&action=edit&section=3" title="Edit section: Lexical token and lexical tokenization"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">Not to be confused with <a href="/wiki/Large_language_model#Tokenization" title="Large language model">Large language model § Tokenization</a>, or <a href="/wiki/Tokenization_(data_security)" title="Tokenization (data security)">tokenization (data security)</a>.</div> <p>A <i>lexical token</i> is a <a href="/wiki/String_(computer_science)" title="String (computer science)">string</a> with an assigned and thus identified meaning, in contrast to the probabilistic token used in <a href="/wiki/Large_language_model" title="Large 
language model">large language models</a>. A lexical token consists of a <i>token name</i> and an optional <i>token value</i>. The token name is a category of a rule-based lexical unit.<sup id="cite_ref-auto_2-0" class="reference"><a href="#cite_note-auto-2"><span class="cite-bracket">[</span>2<span class="cite-bracket">]</span></a></sup> </p> <table class="wikitable"> <caption>Examples of common tokens </caption> <tbody><tr> <th>Token name <p>(Lexical category) </p> </th> <th>Explanation</th> <th>Sample token values </th></tr> <tr> <td><a href="/wiki/Identifier_(computer_languages)" title="Identifier (computer languages)">identifier</a></td> <td>Names assigned by the programmer.</td> <td><code class="mw-highlight mw-highlight-lang-text mw-content-ltr" style="" dir="ltr">x</code>, <code class="mw-highlight mw-highlight-lang-text mw-content-ltr" style="" dir="ltr">color</code>, <code class="mw-highlight mw-highlight-lang-text mw-content-ltr" style="" dir="ltr">UP</code> </td></tr> <tr> <td><a href="/wiki/Reserved_word" title="Reserved word">keyword</a></td> <td>Reserved words of the language.</td> <td><code class="mw-highlight mw-highlight-lang-c mw-content-ltr" style="" dir="ltr"><span class="k">if</span></code>, <code class="mw-highlight mw-highlight-lang-c mw-content-ltr" style="" dir="ltr"><span class="k">while</span></code>, <code class="mw-highlight mw-highlight-lang-c mw-content-ltr" style="" dir="ltr"><span class="k">return</span></code> </td></tr> <tr> <td><a href="/wiki/Delimiter" title="Delimiter">separator/punctuator</a></td> <td>Punctuation characters and paired delimiters.</td> <td><code>}</code>, <code>(</code>, <code>;</code> </td></tr> <tr> <td><a href="/wiki/Operator_(computer_programming)" title="Operator (computer programming)">operator</a></td> <td>Symbols that operate on arguments and produce results.</td> <td><code class="mw-highlight mw-highlight-lang-c mw-content-ltr" style="" dir="ltr"><span class="o">+</span></code>, <code 
class="mw-highlight mw-highlight-lang-c mw-content-ltr" style="" dir="ltr"><span class="o">&lt;</span></code>, <code class="mw-highlight mw-highlight-lang-c mw-content-ltr" style="" dir="ltr"><span class="o">=</span></code> </td></tr> <tr> <td><a href="/wiki/Literal_(computer_programming)" title="Literal (computer programming)">literal</a></td> <td>Numeric, logical, textual, and reference literals.</td> <td><code class="mw-highlight mw-highlight-lang-c mw-content-ltr" style="" dir="ltr"><span class="nb">true</span></code>, <code class="mw-highlight mw-highlight-lang-c mw-content-ltr" style="" dir="ltr"><span class="mf">6.02e23</span></code>, <code class="mw-highlight mw-highlight-lang-c mw-content-ltr" style="" dir="ltr"><span class="s">"music"</span></code> </td></tr> <tr> <td><a href="/wiki/Comment_(computer_programming)" title="Comment (computer programming)">comment</a></td> <td>Line or block comments. Usually discarded.</td> <td><code class="mw-highlight mw-highlight-lang-c mw-content-ltr" style="" dir="ltr"><span class="cm">/* Retrieves user data */</span></code>, <code class="mw-highlight mw-highlight-lang-c mw-content-ltr" style="" dir="ltr"><span class="c1">// must be negative</span></code> </td></tr> <tr> <td><a href="/wiki/Whitespace_character" title="Whitespace character">whitespace</a></td> <td>Groups of non-printable characters. 
Usually discarded.</td> <td>– </td></tr></tbody></table> <p>Consider this expression in the <a href="/wiki/C_(programming_language)" title="C (programming language)">C</a> programming language: </p> <dl><dd><code class="mw-highlight mw-highlight-lang-c mw-content-ltr" style="" dir="ltr"><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">a</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">b</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="mi">2</span><span class="p">;</span></code></dd></dl> <p>The lexical analysis of this expression yields the following sequence of tokens: </p> <dl><dd><code>[(identifier, x), (operator, =), (identifier, a), (operator, +), (identifier, b), (operator, *), (literal, 2), (separator, ;)]</code></dd></dl> <p>A token name is what might be termed a <a href="/wiki/Part_of_speech" title="Part of speech">part of speech</a> in linguistics. </p><p><i>Lexical tokenization</i> is the conversion of a raw text into (semantically or syntactically) meaningful lexical tokens, belonging to categories defined by a "lexer" program, such as identifiers, operators, grouping symbols, and data types. The resulting tokens are then passed on to some other form of processing. The process can be considered a sub-task of <a href="/wiki/Parsing" title="Parsing">parsing</a> input. </p><p>For example, in the text <a href="/wiki/String_(computer_science)" title="String (computer science)">string</a>: </p> <dl><dd><code>The quick brown fox jumps over the lazy dog</code></dd></dl> <p>the string is not implicitly segmented on spaces, as a <a href="/wiki/Natural_language" title="Natural language">natural language</a> speaker would do. 
The raw input, the 43 characters, must be explicitly split into the 9 tokens with a given space delimiter (i.e., matching the string <code>" "</code> or <a href="/wiki/Regular_expression" title="Regular expression">regular expression</a> <code>/\s{1}/</code>). </p><p>When a token class represents more than one possible lexeme, the lexer often saves enough information to reproduce the original lexeme, so that it can be used in <a href="/wiki/Semantic_analysis_(compilers)" title="Semantic analysis (compilers)">semantic analysis</a>. The parser typically retrieves this information from the lexer and stores it in the <a href="/wiki/Abstract_syntax_tree" title="Abstract syntax tree">abstract syntax tree</a>. This is necessary in order to avoid information loss in the case where numbers may also be valid identifiers. </p><p>Tokens are identified based on the specific rules of the lexer. Some methods used to identify tokens include <a href="/wiki/Regular_expression" title="Regular expression">regular expressions</a>, specific sequences of characters termed a <a href="/wiki/Flag_(computing)" class="mw-redirect" title="Flag (computing)">flag</a>, specific separating characters called <a href="/wiki/Delimiter" title="Delimiter">delimiters</a>, and explicit definition by a dictionary. Special characters, including punctuation characters, are commonly used by lexers to identify tokens because of their natural use in written and programming languages. A lexical analyzer generally does nothing with combinations of tokens, a task left for a <a href="/wiki/Parser" class="mw-redirect" title="Parser">parser</a>. For example, a typical lexical analyzer recognizes parentheses as tokens but does nothing to ensure that each "(" is matched with a ")". </p><p>When a lexer feeds tokens to the parser, the representation used is typically an <a href="/wiki/Enumerated_type" title="Enumerated type">enumerated type</a>, which is a list of number representations. 
For example, "Identifier" can be represented with 0, "Assignment operator" with 1, "Addition operator" with 2, etc. </p><p>Tokens are often defined by <a href="/wiki/Regular_expression" title="Regular expression">regular expressions</a>, which are understood by a lexical analyzer generator such as <a href="/wiki/Lex_(software)" title="Lex (software)">lex</a>, or handcoded equivalent <a href="/wiki/Finite_state_automata" class="mw-redirect" title="Finite state automata">finite state automata</a>. The lexical analyzer (generated automatically by a tool like lex or hand-crafted) reads in a stream of characters, identifies the <a href="#Lexeme">lexemes</a> in the stream, and categorizes them into tokens. This is termed <i>tokenizing</i>. If the lexer finds an invalid token, it will report an error. </p><p>Following tokenizing is <a href="/wiki/Parsing" title="Parsing">parsing</a>. From there, the interpreted data may be loaded into data structures for general use, interpretation, or <a href="/wiki/Compiling" class="mw-redirect" title="Compiling">compiling</a>. </p> <div class="mw-heading mw-heading2"><h2 id="Lexical_grammar">Lexical grammar</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Lexical_analysis&action=edit&section=4" title="Edit section: Lexical grammar"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>The specification of a <a href="/wiki/Programming_language" title="Programming language">programming language</a> often includes a set of rules, the <a href="/wiki/Lexical_grammar" title="Lexical grammar">lexical grammar</a>, which defines the lexical syntax. The lexical syntax is usually a <a href="/wiki/Regular_language" title="Regular language">regular language</a>, with the grammar rules consisting of <a href="/wiki/Regular_expression" title="Regular expression">regular expressions</a>; they define the set of possible character sequences (lexemes) of a token. 
A lexer recognizes strings, and for each kind of string found, the lexical program takes an action, most simply producing a token. </p><p>Two important common lexical categories are <a href="/wiki/Whitespace_character" title="Whitespace character">white space</a> and <a href="/wiki/Comment_(computer_programming)" title="Comment (computer programming)">comments</a>. These are also defined in the grammar and processed by the lexer but may be discarded (not producing any tokens) and considered <i>non-significant</i>, at most separating two tokens (as in <code>if x</code> instead of <code>ifx</code>). There are two important exceptions to this. First, in <a href="/wiki/Off-side_rule" title="Off-side rule">off-side rule</a> languages that delimit <a href="/wiki/Block_(programming)" title="Block (programming)">blocks</a> with indenting, initial whitespace is significant, as it determines block structure, and is generally handled at the lexer level; see <a href="#Phrase_structure">phrase structure</a>, below. Secondly, in some uses of lexers, comments and whitespace must be preserved – for example, a <a href="/wiki/Prettyprint" title="Prettyprint">prettyprinter</a> also needs to output the comments, and some debugging tools may provide messages to the programmer showing the original source code. In the 1960s, notably for <a href="/wiki/ALGOL" title="ALGOL">ALGOL</a>, whitespace and comments were eliminated as part of the <a href="/wiki/Line_reconstruction" class="mw-redirect" title="Line reconstruction">line reconstruction</a> phase (the initial phase of the <a href="/wiki/Compiler_frontend" class="mw-redirect" title="Compiler frontend">compiler frontend</a>), but this separate phase has been eliminated and these are now handled by the lexer. 
</p> <div class="mw-heading mw-heading2"><h2 id="Details">Details</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Lexical_analysis&action=edit&section=5" title="Edit section: Details"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <div class="mw-heading mw-heading3"><h3 id="Scanner">Scanner</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Lexical_analysis&action=edit&section=6" title="Edit section: Scanner"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>The first stage, the <i>scanner</i>, is usually based on a <a href="/wiki/Finite-state_machine" title="Finite-state machine">finite-state machine</a> (FSM). It has encoded within it information on the possible sequences of characters that can be contained within any of the tokens it handles (individual instances of these character sequences are termed <a href="#Lexeme">lexemes</a>). For example, an <i>integer</i> lexeme may contain any sequence of <a href="/wiki/Numerical_digit" title="Numerical digit">numerical digit</a> characters. In many cases, the first non-whitespace character can be used to deduce the kind of token that follows and subsequent input characters are then processed one at a time until reaching a character that is not in the set of characters acceptable for that token (this is termed the <i><a href="/wiki/Maximal_munch" title="Maximal munch">maximal munch</a></i>, or <i>longest match</i>, rule). In some languages, the lexeme creation rules are more complex and may involve <a href="/wiki/Backtracking" title="Backtracking">backtracking</a> over previously read characters. For example, in C, one 'L' character is not enough to distinguish between an identifier that begins with 'L' and a wide-character string literal. 
</p> <div class="mw-heading mw-heading3"><h3 id="Evaluator">Evaluator</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Lexical_analysis&action=edit&section=7" title="Edit section: Evaluator"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>A <a href="#Lexeme">lexeme</a>, however, is only a string of characters known to be of a certain kind (e.g., a string literal, a sequence of letters). In order to construct a token, the lexical analyzer needs a second stage, the <i>evaluator</i>, which goes over the characters of the lexeme to produce a <i>value</i>. The lexeme's type combined with its value is what properly constitutes a token, which can be given to a parser. Some tokens such as parentheses do not really have values, and so the evaluator function for these can return nothing: Only the type is needed. Similarly, sometimes evaluators can suppress a lexeme entirely, concealing it from the parser, which is useful for whitespace and comments. The evaluators for identifiers are usually simple (literally representing the identifier), but may include some <a href="/wiki/Stropping_(syntax)" title="Stropping (syntax)">unstropping</a>. The evaluators for <a href="/wiki/Integer_literal" title="Integer literal">integer literals</a> may pass the string on (deferring evaluation to the semantic analysis phase), or may perform evaluation themselves, which can be involved for different bases or floating point numbers. For a simple quoted string literal, the evaluator needs to remove only the quotes, but the evaluator for an <a href="/wiki/String_literal#Escape_sequences" title="String literal">escaped string literal</a> incorporates a lexer, which unescapes the escape sequences. 
</p><p>For example, in the source code of a computer program, the string </p> <dl><dd><code class="mw-highlight mw-highlight-lang-c mw-content-ltr" style="" dir="ltr"><span class="n">net_worth_future</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">(</span><span class="n">assets</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">liabilities</span><span class="p">);</span></code></dd></dl> <p>might be converted into the following lexical token stream; whitespace is suppressed and special characters have no value: </p> <pre>IDENTIFIER net_worth_future EQUALS OPEN_PARENTHESIS IDENTIFIER assets MINUS IDENTIFIER liabilities CLOSE_PARENTHESIS SEMICOLON </pre> <p>Lexers may be written by hand. This is practical if the list of tokens is small, but lexers generated by automated tooling as part of a <a href="/wiki/Compiler-compiler" title="Compiler-compiler">compiler-compiler</a> <a href="/wiki/Toolchain" title="Toolchain">toolchain</a> are more practical for a larger number of potential tokens. These tools generally accept regular expressions that describe the tokens allowed in the input stream. Each regular expression is associated with a <a href="/wiki/Formal_grammar#The_syntax_of_grammars" title="Formal grammar">production rule</a> in the lexical grammar of the programming language that evaluates the lexemes matching the regular expression. These tools may generate source code that can be compiled and executed or construct a <a href="/wiki/State_transition_table" class="mw-redirect" title="State transition table">state transition table</a> for a <a href="/wiki/Finite-state_machine" title="Finite-state machine">finite-state machine</a> (which is plugged into template code for compiling and executing). </p><p>Regular expressions compactly represent patterns that the characters in lexemes might follow. 
For example, for an <a href="/wiki/English_language" title="English language">English</a>-based language, an IDENTIFIER token might be any English alphabetic character or an underscore, followed by any number of instances of ASCII alphanumeric characters and/or underscores. This could be represented compactly by the string <code class="mw-highlight mw-highlight-lang-text mw-content-ltr" style="" dir="ltr">[a-zA-Z_][a-zA-Z_0-9]*</code>. This means "any character a-z, A-Z or _, followed by 0 or more of a-z, A-Z, _ or 0-9". </p><p>Regular expressions and the finite-state machines they generate are not powerful enough to handle recursive patterns, such as "<i>n</i> opening parentheses, followed by a statement, followed by <i>n</i> closing parentheses." They are unable to keep count, and verify that <i>n</i> is the same on both sides, unless a finite set of permissible values exists for <i>n</i>. It takes a full parser to recognize such patterns in their full generality. A parser can push parentheses on a stack and then try to pop them off and see if the stack is empty at the end (see example<sup id="cite_ref-3" class="reference"><a href="#cite_note-3"><span class="cite-bracket">[</span>3<span class="cite-bracket">]</span></a></sup> in the <i><a href="/wiki/Structure_and_Interpretation_of_Computer_Programs" title="Structure and Interpretation of Computer Programs">Structure and Interpretation of Computer Programs</a></i> book). </p> <div class="mw-heading mw-heading2"><h2 id="Obstacles">Obstacles</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Lexical_analysis&action=edit&section=8" title="Edit section: Obstacles"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Typically, lexical tokenization occurs at the word level. However, it is sometimes difficult to define what is meant by a "word". 
Often, a tokenizer relies on simple heuristics, for example: </p> <ul><li>Punctuation and whitespace may or may not be included in the resulting list of tokens.</li> <li>All contiguous strings of alphabetic characters are part of one token; likewise with numbers.</li> <li>Tokens are separated by <a href="/wiki/Whitespace_character" title="Whitespace character">whitespace</a> characters, such as a space or line break, or by punctuation characters.</li></ul> <p>In languages that use inter-word spaces (such as most that use the Latin alphabet, and most programming languages), this approach is fairly straightforward. However, even here there are many edge cases such as <a href="/wiki/Poetic_contraction" title="Poetic contraction">contractions</a>, <a href="/wiki/Hyphen" title="Hyphen">hyphenated</a> words, <a href="/wiki/Emoticon" title="Emoticon">emoticons</a>, and larger constructs such as <a href="/wiki/URI" class="mw-redirect" title="URI">URIs</a> (which for some purposes may count as single tokens). A classic example is "New York-based", which a naive tokenizer may break at the space even though the better break is (arguably) at the hyphen. </p><p>Tokenization is particularly difficult for languages written in <a href="/wiki/Scriptio_continua" title="Scriptio continua">scriptio continua</a>, which exhibit no word boundaries, such as <a href="/wiki/Ancient_Greek" title="Ancient Greek">Ancient Greek</a>, <a href="/wiki/Chinese_language" title="Chinese language">Chinese</a>,<sup id="cite_ref-4" class="reference"><a href="#cite_note-4"><span class="cite-bracket">[</span>4<span class="cite-bracket">]</span></a></sup> or <a href="/wiki/Thai_language" title="Thai language">Thai</a>. <a href="/wiki/Agglutinative_language" title="Agglutinative language">Agglutinative languages</a>, such as Korean, also make tokenization tasks complicated. 
</p><p>Some ways to address the more difficult problems include developing more complex heuristics, querying a table of common special cases, or fitting the tokens to a <a href="/wiki/Language_model" title="Language model">language model</a> that identifies collocations in a later processing step. </p> <div class="mw-heading mw-heading2"><h2 id="Lexer_generator">Lexer generator</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Lexical_analysis&action=edit&section=9" title="Edit section: Lexer generator"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">See also: <a href="/wiki/Parser_generator" class="mw-redirect" title="Parser generator">Parser generator</a></div> <p>Lexers are often generated by a <i>lexer generator</i>, analogous to <a href="/wiki/Parser_generator" class="mw-redirect" title="Parser generator">parser generators</a>, and such tools often come together. The most established is <a href="/wiki/Lex_(software)" title="Lex (software)">lex</a>, paired with the <a href="/wiki/Yacc" title="Yacc">yacc</a> parser generator, or rather some of their many reimplementations, like <a href="/wiki/Flex_(lexical_analyser_generator)" title="Flex (lexical analyser generator)">flex</a> (often paired with <a href="/wiki/GNU_Bison" title="GNU Bison">GNU Bison</a>). These generators are a form of <a href="/wiki/Domain-specific_language" title="Domain-specific language">domain-specific language</a>, taking in a lexical specification – generally regular expressions with some markup – and emitting a lexer. </p><p>These tools yield very fast development, which is very important in early development, both to get a working lexer and because a language specification may change often. 
Further, they often provide advanced features, such as pre- and post-conditions which are hard to program by hand. However, an automatically generated lexer may lack flexibility, and thus may require some manual modification, or an all-manually written lexer. </p><p>Lexer performance is a concern, and optimizing is worthwhile, more so in stable languages where the lexer is run very often (such as C or HTML). lex/flex-generated lexers are reasonably fast, but improvements of two to three times are possible using more tuned generators. Hand-written lexers are sometimes used, but modern lexer generators produce faster lexers than most hand-coded ones. The lex/flex family of generators uses a table-driven approach which is much less efficient than the directly coded approach.<sup class="noprint Inline-Template" style="white-space:nowrap;">[<i><a href="/wiki/Wikipedia:Accuracy_dispute#Disputed_statement" title="Wikipedia:Accuracy dispute"><span title="The material near this tag is possibly inaccurate or nonfactual. (May 2010)">dubious</span></a> – <a href="/wiki/Talk:Lexical_analysis#table-driven_vs_directly_coded" title="Talk:Lexical analysis">discuss</a></i>]</sup> With the latter approach the generator produces an engine that directly jumps to follow-up states via goto statements. Tools like <a href="/wiki/Re2c" title="Re2c">re2c</a><sup id="cite_ref-5" class="reference"><a href="#cite_note-5"><span class="cite-bracket">[</span>5<span class="cite-bracket">]</span></a></sup> have proven to produce engines that are between two and three times faster than flex-produced engines.<sup class="noprint Inline-Template Template-Fact" style="white-space:nowrap;">[<i><a href="/wiki/Wikipedia:Citation_needed" title="Wikipedia:Citation needed"><span title="This claim needs references to reliable sources. (April 2008)">citation needed</span></a></i>]</sup> It is in general difficult to hand-write analyzers that perform better than engines generated by these latter tools. 
</p> <div class="mw-heading mw-heading2"><h2 id="Phrase_structure">Phrase structure</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Lexical_analysis&action=edit&section=10" title="Edit section: Phrase structure"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Lexical analysis mainly segments the input stream of characters into tokens, simply grouping the characters into pieces and categorizing them. However, the lexing may be significantly more complex; most simply, lexers may omit tokens or insert added tokens. Omitting tokens, notably whitespace and comments, is very common when these are not needed by the compiler. Less commonly, added tokens may be inserted. This is done mainly to group tokens into <a href="/wiki/Statement_(computer_science)" title="Statement (computer science)">statements</a>, or statements into blocks, to simplify the parser. </p> <div class="mw-heading mw-heading3"><h3 id="Line_continuation">Line continuation</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Lexical_analysis&action=edit&section=11" title="Edit section: Line continuation"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p><a href="/wiki/Line_continuation" class="mw-redirect" title="Line continuation">Line continuation</a> is a feature of some languages where a newline is normally a statement terminator. Most often, ending a line with a backslash (immediately followed by a <a href="/wiki/Newline" title="Newline">newline</a>) results in the line being <i>continued</i> – the following line is <i>joined</i> to the prior line. This is generally done in the lexer: The backslash and newline are discarded, rather than the newline being tokenized. 
Examples include <a href="/wiki/Bash_(Unix_shell)" title="Bash (Unix shell)">bash</a>,<sup id="cite_ref-6" class="reference"><a href="#cite_note-6"><span class="cite-bracket">[</span>6<span class="cite-bracket">]</span></a></sup> other shell scripts and Python.<sup id="cite_ref-3.6.4_Documentation_7-0" class="reference"><a href="#cite_note-3.6.4_Documentation-7"><span class="cite-bracket">[</span>7<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Semicolon_insertion">Semicolon insertion</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Lexical_analysis&action=edit&section=12" title="Edit section: Semicolon insertion"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Many languages use the semicolon as a statement terminator. Most often this is mandatory, but in some languages the semicolon is optional in many contexts. This is mainly done at the lexer level, where the lexer outputs a semicolon into the token stream, despite one not being present in the input character stream, and is termed <i>semicolon insertion</i> or <i>automatic semicolon insertion</i>. In these cases, semicolons are part of the formal phrase grammar of the language, but may not be found in input text, as they can be inserted by the lexer. Optional semicolons or other terminators or separators are also sometimes handled at the parser level, notably in the case of <a href="/wiki/Trailing_comma" class="mw-redirect" title="Trailing comma">trailing commas</a> or semicolons. 
</p><p>Semicolon insertion is a feature of <a href="/wiki/BCPL" title="BCPL">BCPL</a> and its distant descendant <a href="/wiki/Go_(programming_language)" title="Go (programming language)">Go</a>,<sup id="cite_ref-8" class="reference"><a href="#cite_note-8"><span class="cite-bracket">[</span>8<span class="cite-bracket">]</span></a></sup> though it is absent in B or C.<sup id="cite_ref-9" class="reference"><a href="#cite_note-9"><span class="cite-bracket">[</span>9<span class="cite-bracket">]</span></a></sup> Semicolon insertion is present in <a href="/wiki/JavaScript" title="JavaScript">JavaScript</a>, though the rules are somewhat complex and much-criticized; to avoid bugs, some recommend always using semicolons, while others use initial semicolons, termed <a href="/wiki/Defensive_semicolon" class="mw-redirect" title="Defensive semicolon">defensive semicolons</a>, at the start of potentially ambiguous statements. </p><p>Semicolon insertion (in languages with semicolon-terminated statements) and line continuation (in languages with newline-terminated statements) can be seen as complementary: Semicolon insertion adds a token even though newlines generally do <i>not</i> generate tokens, while line continuation prevents a token from being generated even though newlines generally <i>do</i> generate tokens. 
</p> <div class="mw-heading mw-heading3"><h3 id="Off-side_rule">Off-side rule</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Lexical_analysis&action=edit&section=13" title="Edit section: Off-side rule"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">Further information: <a href="/wiki/Off-side_rule" title="Off-side rule">Off-side rule</a></div> <p>The <a href="/wiki/Off-side_rule" title="Off-side rule">off-side rule</a> (blocks determined by indenting) can be implemented in the lexer, as in <a href="/wiki/Python_(programming_language)" title="Python (programming language)">Python</a>, where increasing the indenting results in the lexer emitting an INDENT token and decreasing the indenting results in the lexer emitting one or more DEDENT tokens.<sup id="cite_ref-10" class="reference"><a href="#cite_note-10"><span class="cite-bracket">[</span>10<span class="cite-bracket">]</span></a></sup> These tokens correspond to the opening brace <code>{</code> and closing brace <code>}</code> in languages that use braces for blocks, and mean that the phrase grammar does not depend on whether braces or indenting are used. This requires that the lexer hold state, namely a stack of indent levels, so that it can detect changes in indenting; as a result, the lexical grammar is not <a href="/wiki/Context-free_grammar" title="Context-free grammar">context-free</a>: INDENT and DEDENT depend on the contextual information of prior indent levels. 
</p> <div class="mw-heading mw-heading2"><h2 id="Context-sensitive_lexing">Context-sensitive lexing</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Lexical_analysis&action=edit&section=14" title="Edit section: Context-sensitive lexing"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Generally lexical grammars are context-free, or almost so, and thus require no looking back or ahead, or backtracking, which allows a simple, clean, and efficient implementation. This also allows simple one-way communication from lexer to parser, without needing any information flowing back to the lexer. </p><p>There are exceptions, however. Simple examples include semicolon insertion in Go, which requires looking back one token; concatenation of consecutive string literals in Python,<sup id="cite_ref-3.6.4_Documentation_7-1" class="reference"><a href="#cite_note-3.6.4_Documentation-7"><span class="cite-bracket">[</span>7<span class="cite-bracket">]</span></a></sup> which requires holding one token in a buffer before emitting it (to see if the next token is another string literal); and the off-side rule in Python, which requires maintaining a count of indent level (indeed, a stack of each indent level). These examples all only require lexical context, and while they complicate a lexer somewhat, they are invisible to the parser and later phases. </p><p>A more complex example is <a href="/wiki/The_lexer_hack" class="mw-redirect" title="The lexer hack">the lexer hack</a> in C, where the token class of a sequence of characters cannot be determined until the semantic analysis phase since <a href="/wiki/Typedef" title="Typedef">typedef</a> names and variable names are lexically identical but constitute different token classes. Thus in the hack, the lexer calls the semantic analyzer (say, symbol table) and checks if the sequence requires a typedef name. 
In this case, information must flow back not from the parser only, but from the semantic analyzer back to the lexer, which complicates design. </p> <div class="mw-heading mw-heading2"><h2 id="See_also">See also</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Lexical_analysis&action=edit&section=15" title="Edit section: See also"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <ul><li><a href="/wiki/Lexicalization" title="Lexicalization">Lexicalization</a></li> <li><a href="/wiki/Lexical_semantics" title="Lexical semantics">Lexical semantics</a></li> <li><a href="/wiki/List_of_parser_generators" class="mw-redirect" title="List of parser generators">List of parser generators</a></li></ul> <div class="mw-heading mw-heading2"><h2 id="References">References</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Lexical_analysis&action=edit&section=16" title="Edit section: References"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <style data-mw-deduplicate="TemplateStyles:r1239543626">.mw-parser-output .reflist{margin-bottom:0.5em;list-style-type:decimal}@media screen{.mw-parser-output .reflist{font-size:90%}}.mw-parser-output .reflist .references{font-size:100%;margin-bottom:0;list-style-type:inherit}.mw-parser-output .reflist-columns-2{column-width:30em}.mw-parser-output .reflist-columns-3{column-width:25em}.mw-parser-output .reflist-columns{margin-top:0.3em}.mw-parser-output .reflist-columns ol{margin-top:0}.mw-parser-output .reflist-columns li{page-break-inside:avoid;break-inside:avoid-column}.mw-parser-output .reflist-upper-alpha{list-style-type:upper-alpha}.mw-parser-output .reflist-upper-roman{list-style-type:upper-roman}.mw-parser-output .reflist-lower-alpha{list-style-type:lower-alpha}.mw-parser-output .reflist-lower-greek{list-style-type:lower-greek}.mw-parser-output 
.reflist-lower-roman{list-style-type:lower-roman}</style><div class="reflist"> <div class="mw-references-wrap"><ol class="references"> <li id="cite_note-1"><span class="mw-cite-backlink"><b><a href="#cite_ref-1">^</a></b></span> <span class="reference-text"><style data-mw-deduplicate="TemplateStyles:r1238218222">.mw-parser-output cite.citation{font-style:inherit;word-wrap:break-word}.mw-parser-output .citation q{quotes:"\"""\"""'""'"}.mw-parser-output .citation:target{background-color:rgba(0,127,255,0.133)}.mw-parser-output .id-lock-free.id-lock-free a{background:url("//upload.wikimedia.org/wikipedia/commons/6/65/Lock-green.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-limited.id-lock-limited a,.mw-parser-output .id-lock-registration.id-lock-registration a{background:url("//upload.wikimedia.org/wikipedia/commons/d/d6/Lock-gray-alt-2.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-subscription.id-lock-subscription a{background:url("//upload.wikimedia.org/wikipedia/commons/a/aa/Lock-red-alt-2.svg")right 0.1em center/9px no-repeat}.mw-parser-output .cs1-ws-icon a{background:url("//upload.wikimedia.org/wikipedia/commons/4/4c/Wikisource-logo.svg")right 0.1em center/12px no-repeat}body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-free a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-limited a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-registration a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-subscription a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .cs1-ws-icon a{background-size:contain;padding:0 1em 0 0}.mw-parser-output .cs1-code{color:inherit;background:inherit;border:none;padding:inherit}.mw-parser-output .cs1-hidden-error{display:none;color:var(--color-error,#d33)}.mw-parser-output .cs1-visible-error{color:var(--color-error,#d33)}.mw-parser-output 
.cs1-maint{display:none;color:#085;margin-left:0.3em}.mw-parser-output .cs1-kern-left{padding-left:0.2em}.mw-parser-output .cs1-kern-right{padding-right:0.2em}.mw-parser-output .citation .mw-selflink{font-weight:inherit}@media screen{.mw-parser-output .cs1-format{font-size:95%}html.skin-theme-clientpref-night .mw-parser-output .cs1-maint{color:#18911f}}@media screen and (prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .cs1-maint{color:#18911f}}</style><cite class="citation web cs1"><a rel="nofollow" class="external text" href="http://www.cs.man.ac.uk/~pjj/farrell/comp3.html">"Anatomy of a Compiler and The Tokenizer"</a>. <i>www.cs.man.ac.uk</i>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=www.cs.man.ac.uk&rft.atitle=Anatomy+of+a+Compiler+and+The+Tokenizer&rft_id=http%3A%2F%2Fwww.cs.man.ac.uk%2F~pjj%2Ffarrell%2Fcomp3.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALexical+analysis" class="Z3988"></span></span> </li> <li id="cite_note-auto-2"><span class="mw-cite-backlink"><b><a href="#cite_ref-auto_2-0">^</a></b></span> <span class="reference-text">page 111, "Compilers Principles, Techniques, & Tools, 2nd Ed." (WorldCat) by Aho, Lam, Sethi and Ullman, as quoted in <a rel="nofollow" class="external free" href="https://stackoverflow.com/questions/14954721/what-is-the-difference-between-token-and-lexeme">https://stackoverflow.com/questions/14954721/what-is-the-difference-between-token-and-lexeme</a></span> </li> <li id="cite_note-3"><span class="mw-cite-backlink"><b><a href="#cite_ref-3">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://web.archive.org/web/20121030233934/http://mitpress.mit.edu/sicp/full-text/book/book-Z-H-31.html#%25_sec_5.1.4">"Structure and Interpretation of Computer Programs"</a>. 
<i>mitpress.mit.edu</i>. Archived from <a rel="nofollow" class="external text" href="http://mitpress.mit.edu/sicp/full-text/book/book-Z-H-31.html#%25_sec_5.1.4">the original</a> on 2012-10-30<span class="reference-accessdate">. Retrieved <span class="nowrap">2009-03-07</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=mitpress.mit.edu&rft.atitle=Structure+and+Interpretation+of+Computer+Programs&rft_id=http%3A%2F%2Fmitpress.mit.edu%2Fsicp%2Ffull-text%2Fbook%2Fbook-Z-H-31.html%23%2525_sec_5.1.4&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALexical+analysis" class="Z3988"></span></span> </li> <li id="cite_note-4"><span class="mw-cite-backlink"><b><a href="#cite_ref-4">^</a></b></span> <span class="reference-text">Huang, C., Simon, P., Hsieh, S., & Prevot, L. (2007) <a rel="nofollow" class="external text" href="http://www.aclweb.org/anthology/P/P07/P07-2018.pdf">Rethinking Chinese Word Segmentation: Tokenization, Character Classification, or Word break Identification</a></span> </li> <li id="cite_note-5"><span class="mw-cite-backlink"><b><a href="#cite_ref-5">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBumbulisCowan1993" class="citation journal cs1">Bumbulis, P.; Cowan, D. D. (Mar–Dec 1993). <a rel="nofollow" class="external text" href="https://doi.org/10.1145%2F176454.176487">"RE2C: A more versatile scanner generator"</a>. <i>ACM Letters on Programming Languages and Systems</i>. <b>2</b> (1–4): 70–84. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.1145%2F176454.176487">10.1145/176454.176487</a></span>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:14814637">14814637</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=ACM+Letters+on+Programming+Languages+and+Systems&rft.atitle=RE2C%3A+A+more+versatile+scanner+generator&rft.volume=2&rft.issue=1%E2%80%934&rft.pages=70-84&rft.date=1993-03%2F1993-12&rft_id=info%3Adoi%2F10.1145%2F176454.176487&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A14814637%23id-name%3DS2CID&rft.aulast=Bumbulis&rft.aufirst=P.&rft.au=Cowan%2C+D.+D.&rft_id=https%3A%2F%2Fdoi.org%2F10.1145%252F176454.176487&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALexical+analysis" class="Z3988"></span></span> </li> <li id="cite_note-6"><span class="mw-cite-backlink"><b><a href="#cite_ref-6">^</a></b></span> <span class="reference-text"><i><a rel="nofollow" class="external text" href="https://www.gnu.org/software/bash/manual/bashref.html">Bash Reference Manual</a></i>, <a rel="nofollow" class="external text" href="https://www.gnu.org/software/bash/manual/bashref.html#Escape-Character">3.1.2.1 Escape Character</a></span> </li> <li id="cite_note-3.6.4_Documentation-7"><span class="mw-cite-backlink">^ <a href="#cite_ref-3.6.4_Documentation_7-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-3.6.4_Documentation_7-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://docs.python.org/">"3.6.4 Documentation"</a>. 
<i>docs.python.org</i>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=docs.python.org&rft.atitle=3.6.4+Documentation&rft_id=https%3A%2F%2Fdocs.python.org%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALexical+analysis" class="Z3988"></span></span> </li> <li id="cite_note-8"><span class="mw-cite-backlink"><b><a href="#cite_ref-8">^</a></b></span> <span class="reference-text"><i><a rel="nofollow" class="external text" href="https://golang.org/doc/effective_go.html">Effective Go</a></i>, "<a rel="nofollow" class="external text" href="https://golang.org/doc/effective_go.html#semicolons">Semicolons</a>"</span> </li> <li id="cite_note-9"><span class="mw-cite-backlink"><b><a href="#cite_ref-9">^</a></b></span> <span class="reference-text">"<a rel="nofollow" class="external text" href="https://groups.google.com/forum/#!topic/golang-nuts/XuMrWI0Q8uk">Semicolons in Go</a>", golang-nuts, Rob 'Commander' Pike, 12/10/09</span> </li> <li id="cite_note-10"><span class="mw-cite-backlink"><b><a href="#cite_ref-10">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://docs.python.org/3.11/reference/lexical_analysis.html#indentation">"Lexical analysis > Indentation"</a>. <i>The Python Language Reference</i><span class="reference-accessdate">. 
Retrieved <span class="nowrap">21 June</span> 2023</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=The+Python+Language+Reference&rft.atitle=Lexical+analysis+%3E+Indentation&rft_id=https%3A%2F%2Fdocs.python.org%2F3.11%2Freference%2Flexical_analysis.html%23indentation&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALexical+analysis" class="Z3988"></span></span> </li> </ol></div></div> <div class="mw-heading mw-heading3"><h3 id="Sources">Sources</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Lexical_analysis&action=edit&section=17" title="Edit section: Sources"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <style data-mw-deduplicate="TemplateStyles:r1239549316">.mw-parser-output .refbegin{margin-bottom:0.5em}.mw-parser-output .refbegin-hanging-indents>ul{margin-left:0}.mw-parser-output .refbegin-hanging-indents>ul>li{margin-left:0;padding-left:3.2em;text-indent:-3.2em}.mw-parser-output .refbegin-hanging-indents ul,.mw-parser-output .refbegin-hanging-indents ul li{list-style:none}@media(max-width:720px){.mw-parser-output .refbegin-hanging-indents>ul>li{padding-left:1.6em;text-indent:-1.6em}}.mw-parser-output .refbegin-columns{margin-top:0.3em}.mw-parser-output .refbegin-columns ul{margin-top:0}.mw-parser-output .refbegin-columns li{page-break-inside:avoid;break-inside:avoid-column}@media screen{.mw-parser-output .refbegin{font-size:90%}}</style><div class="refbegin" style=""> <ul><li><i>Compiling with C# and Java</i>, Pat Terry, 2005, <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/032126360X" title="Special:BookSources/032126360X">032126360X</a></li> <li><i>Algorithms + Data Structures = Programs</i>, Niklaus Wirth, 1975, <link 
rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/0-13-022418-9" title="Special:BookSources/0-13-022418-9">0-13-022418-9</a></li> <li><i>Compiler Construction</i>, Niklaus Wirth, 1996, <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/0-201-40353-6" title="Special:BookSources/0-201-40353-6">0-201-40353-6</a></li> <li>Sebesta, R. W. (2006). Concepts of programming languages (Seventh edition) p. 177. Boston: Pearson/Addison-Wesley.</li></ul> </div> <div class="mw-heading mw-heading2"><h2 id="External_links">External links</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Lexical_analysis&action=edit&section=18" title="Edit section: External links"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <ul><li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFYangTsayChan2002" class="citation journal cs1">Yang, W.; Tsay, Chey-Woei; Chan, Jien-Tsai (2002). <a rel="nofollow" class="external text" href="http://people.cs.nctu.edu.tw/~wuuyang/homepage/papers/applicability2002.ps">"On the applicability of the longest-match rule in lexical analysis"</a>. <i>Computer Languages, Systems & Structures</i>. <b>28</b> (3): 273–288. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1016%2FS0096-0551%2802%2900014-0">10.1016/S0096-0551(02)00014-0</a>. 
NSC 86-2213-E-009-021 and NSC 86-2213-E-009-079.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Computer+Languages%2C+Systems+%26+Structures&rft.atitle=On+the+applicability+of+the+longest-match+rule+in+lexical+analysis&rft.volume=28&rft.issue=3&rft.pages=273-288&rft.date=2002&rft_id=info%3Adoi%2F10.1016%2FS0096-0551%2802%2900014-0&rft.aulast=Yang&rft.aufirst=W.&rft.au=Tsay%2C+Chey-Woei&rft.au=Chan%2C+Jien-Tsai&rft_id=http%3A%2F%2Fpeople.cs.nctu.edu.tw%2F~wuuyang%2Fhomepage%2Fpapers%2Fapplicability2002.ps&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALexical+analysis" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFTrim2013" class="citation web cs1">Trim, Craig (Jan 23, 2013). <a rel="nofollow" class="external text" href="https://web.archive.org/web/20190530155339/https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization?lang=en">"The Art of Tokenization"</a>. <i>Developer Works</i>. IBM. 
Archived from <a rel="nofollow" class="external text" href="https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization">the original</a> on 2019-05-30.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Developer+Works&rft.atitle=The+Art+of+Tokenization&rft.date=2013-01-23&rft.aulast=Trim&rft.aufirst=Craig&rft_id=https%3A%2F%2Fwww.ibm.com%2Fdeveloperworks%2Fcommunity%2Fblogs%2Fnlp%2Fentry%2Ftokenization&rfr_id=info%3Asid%2Fen.wikipedia.org%3ALexical+analysis" class="Z3988"></span></li> <li><a rel="nofollow" class="external text" href="http://www.gabormelli.com/RKB/Word_Mention_Segmentation_Task">Word Mention Segmentation Task</a>, an analysis</li></ul> <div class="navbox-styles"><style data-mw-deduplicate="TemplateStyles:r1129693374">.mw-parser-output .hlist dl,.mw-parser-output .hlist ol,.mw-parser-output .hlist ul{margin:0;padding:0}.mw-parser-output .hlist dd,.mw-parser-output .hlist dt,.mw-parser-output .hlist li{margin:0;display:inline}.mw-parser-output .hlist.inline,.mw-parser-output .hlist.inline dl,.mw-parser-output .hlist.inline ol,.mw-parser-output .hlist.inline ul,.mw-parser-output .hlist dl dl,.mw-parser-output .hlist dl ol,.mw-parser-output .hlist dl ul,.mw-parser-output .hlist ol dl,.mw-parser-output .hlist ol ol,.mw-parser-output .hlist ol ul,.mw-parser-output .hlist ul dl,.mw-parser-output .hlist ul ol,.mw-parser-output .hlist ul ul{display:inline}.mw-parser-output .hlist .mw-empty-li{display:none}.mw-parser-output .hlist dt::after{content:": "}.mw-parser-output .hlist dd::after,.mw-parser-output .hlist li::after{content:" · ";font-weight:bold}.mw-parser-output .hlist dd:last-child::after,.mw-parser-output .hlist dt:last-child::after,.mw-parser-output .hlist li:last-child::after{content:none}.mw-parser-output .hlist dd dd:first-child::before,.mw-parser-output .hlist dd dt:first-child::before,.mw-parser-output .hlist dd li:first-child::before,.mw-parser-output .hlist 
dt dd:first-child::before,.mw-parser-output .hlist dt dt:first-child::before,.mw-parser-output .hlist dt li:first-child::before,.mw-parser-output .hlist li dd:first-child::before,.mw-parser-output .hlist li dt:first-child::before,.mw-parser-output .hlist li li:first-child::before{content:" (";font-weight:normal}.mw-parser-output .hlist dd dd:last-child::after,.mw-parser-output .hlist dd dt:last-child::after,.mw-parser-output .hlist dd li:last-child::after,.mw-parser-output .hlist dt dd:last-child::after,.mw-parser-output .hlist dt dt:last-child::after,.mw-parser-output .hlist dt li:last-child::after,.mw-parser-output .hlist li dd:last-child::after,.mw-parser-output .hlist li dt:last-child::after,.mw-parser-output .hlist li li:last-child::after{content:")";font-weight:normal}.mw-parser-output .hlist ol{counter-reset:listitem}.mw-parser-output .hlist ol>li{counter-increment:listitem}.mw-parser-output .hlist ol>li::before{content:" "counter(listitem)"\a0 "}.mw-parser-output .hlist dd ol>li:first-child::before,.mw-parser-output .hlist dt ol>li:first-child::before,.mw-parser-output .hlist li ol>li:first-child::before{content:" ("counter(listitem)"\a0 "}</style><style data-mw-deduplicate="TemplateStyles:r1236075235">.mw-parser-output .navbox{box-sizing:border-box;border:1px solid #a2a9b1;width:100%;clear:both;font-size:88%;text-align:center;padding:1px;margin:1em auto 0}.mw-parser-output .navbox .navbox{margin-top:0}.mw-parser-output .navbox+.navbox,.mw-parser-output .navbox+.navbox-styles+.navbox{margin-top:-1px}.mw-parser-output .navbox-inner,.mw-parser-output .navbox-subgroup{width:100%}.mw-parser-output .navbox-group,.mw-parser-output .navbox-title,.mw-parser-output .navbox-abovebelow{padding:0.25em 1em;line-height:1.5em;text-align:center}.mw-parser-output .navbox-group{white-space:nowrap;text-align:right}.mw-parser-output .navbox,.mw-parser-output .navbox-subgroup{background-color:#fdfdfd}.mw-parser-output 
.navbox-list{line-height:1.5em;border-color:#fdfdfd}.mw-parser-output .navbox-list-with-group{text-align:left;border-left-width:2px;border-left-style:solid}.mw-parser-output tr+tr>.navbox-abovebelow,.mw-parser-output tr+tr>.navbox-group,.mw-parser-output tr+tr>.navbox-image,.mw-parser-output tr+tr>.navbox-list{border-top:2px solid #fdfdfd}.mw-parser-output .navbox-title{background-color:#ccf}.mw-parser-output .navbox-abovebelow,.mw-parser-output .navbox-group,.mw-parser-output .navbox-subgroup .navbox-title{background-color:#ddf}.mw-parser-output .navbox-subgroup .navbox-group,.mw-parser-output .navbox-subgroup .navbox-abovebelow{background-color:#e6e6ff}.mw-parser-output .navbox-even{background-color:#f7f7f7}.mw-parser-output .navbox-odd{background-color:transparent}.mw-parser-output .navbox .hlist td dl,.mw-parser-output .navbox .hlist td ol,.mw-parser-output .navbox .hlist td ul,.mw-parser-output .navbox td.hlist dl,.mw-parser-output .navbox td.hlist ol,.mw-parser-output .navbox td.hlist ul{padding:0.125em 0}.mw-parser-output .navbox .navbar{display:block;font-size:100%}.mw-parser-output .navbox-title .navbar{float:left;text-align:left;margin-right:0.5em}body.skin--responsive .mw-parser-output .navbox-image img{max-width:none!important}@media print{body.ns-0 .mw-parser-output .navbox{display:none!important}}</style></div><div role="navigation" class="navbox" aria-labelledby="Natural_language_processing" style="padding:3px"><table class="nowraplinks hlist mw-collapsible autocollapse navbox-inner" style="border-spacing:0;background:transparent;color:inherit"><tbody><tr><th scope="col" class="navbox-title" colspan="2"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374"><style data-mw-deduplicate="TemplateStyles:r1239400231">.mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{float:left;text-align:left}.mw-parser-output .navbar-boxtext{word-spacing:0}.mw-parser-output .navbar 
ul{display:inline-block;white-space:nowrap;line-height:inherit}.mw-parser-output .navbar-brackets::before{margin-right:-0.125em;content:"[ "}.mw-parser-output .navbar-brackets::after{margin-left:-0.125em;content:" ]"}.mw-parser-output .navbar li{word-spacing:-0.125em}.mw-parser-output .navbar a>span,.mw-parser-output .navbar a>abbr{text-decoration:inherit}.mw-parser-output .navbar-mini abbr{font-variant:small-caps;border-bottom:none;text-decoration:none;cursor:inherit}.mw-parser-output .navbar-ct-full{font-size:114%;margin:0 7em}.mw-parser-output .navbar-ct-mini{font-size:114%;margin:0 4em}html.skin-theme-clientpref-night .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}@media(prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}}@media print{.mw-parser-output .navbar{display:none!important}}</style><div class="navbar plainlinks hlist navbar-mini"><ul><li class="nv-view"><a href="/wiki/Template:Natural_language_processing" title="Template:Natural language processing"><abbr title="View this template">v</abbr></a></li><li class="nv-talk"><a href="/wiki/Template_talk:Natural_language_processing" title="Template talk:Natural language processing"><abbr title="Discuss this template">t</abbr></a></li><li class="nv-edit"><a href="/wiki/Special:EditPage/Template:Natural_language_processing" title="Special:EditPage/Template:Natural language processing"><abbr title="Edit this template">e</abbr></a></li></ul></div><div id="Natural_language_processing" style="font-size:114%;margin:0 4em"><a href="/wiki/Natural_language_processing" title="Natural language processing">Natural language processing</a></div></th></tr><tr><th scope="row" class="navbox-group" style="width:1%">General terms</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/AI-complete" title="AI-complete">AI-complete</a></li> <li><a 
href="/wiki/Bag-of-words_model" title="Bag-of-words model">Bag-of-words</a></li> <li><a href="/wiki/N-gram" title="N-gram">n-gram</a> <ul><li><a href="/wiki/Bigram" title="Bigram">Bigram</a></li> <li><a href="/wiki/Trigram" title="Trigram">Trigram</a></li></ul></li> <li><a href="/wiki/Computational_linguistics" title="Computational linguistics">Computational linguistics</a></li> <li><a href="/wiki/Natural_language_understanding" title="Natural language understanding">Natural language understanding</a></li> <li><a href="/wiki/Stop_word" title="Stop word">Stop words</a></li> <li><a href="/wiki/Text_processing" title="Text processing">Text processing</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Text_mining" title="Text mining">Text analysis</a></th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Argument_mining" title="Argument mining">Argument mining</a></li> <li><a href="/wiki/Collocation_extraction" title="Collocation extraction">Collocation extraction</a></li> <li><a href="/wiki/Concept_mining" title="Concept mining">Concept mining</a></li> <li><a href="/wiki/Coreference#Coreference_resolution" title="Coreference">Coreference resolution</a></li> <li><a href="/wiki/Deep_linguistic_processing" title="Deep linguistic processing">Deep linguistic processing</a></li> <li><a href="/wiki/Distant_reading" title="Distant reading">Distant reading</a></li> <li><a href="/wiki/Information_extraction" title="Information extraction">Information extraction</a></li> <li><a href="/wiki/Named-entity_recognition" title="Named-entity recognition">Named-entity recognition</a></li> <li><a href="/wiki/Ontology_learning" title="Ontology learning">Ontology learning</a></li> <li><a href="/wiki/Parsing" title="Parsing">Parsing</a> <ul><li><a href="/wiki/Semantic_parsing" title="Semantic parsing">Semantic parsing</a></li> <li><a 
href="/wiki/Syntactic_parsing_(computational_linguistics)" title="Syntactic parsing (computational linguistics)">Syntactic parsing</a></li></ul></li> <li><a href="/wiki/Part-of-speech_tagging" title="Part-of-speech tagging">Part-of-speech tagging</a></li> <li><a href="/wiki/Semantic_analysis_(machine_learning)" title="Semantic analysis (machine learning)">Semantic analysis</a></li> <li><a href="/wiki/Semantic_role_labeling" title="Semantic role labeling">Semantic role labeling</a></li> <li><a href="/wiki/Semantic_decomposition_(natural_language_processing)" title="Semantic decomposition (natural language processing)">Semantic decomposition</a></li> <li><a href="/wiki/Semantic_similarity" title="Semantic similarity">Semantic similarity</a></li> <li><a href="/wiki/Sentiment_analysis" title="Sentiment analysis">Sentiment analysis</a></li></ul> <ul><li><a href="/wiki/Terminology_extraction" title="Terminology extraction">Terminology extraction</a></li> <li><a href="/wiki/Text_mining" title="Text mining">Text mining</a></li> <li><a href="/wiki/Textual_entailment" title="Textual entailment">Textual entailment</a></li> <li><a href="/wiki/Truecasing" title="Truecasing">Truecasing</a></li> <li><a href="/wiki/Word-sense_disambiguation" title="Word-sense disambiguation">Word-sense disambiguation</a></li> <li><a href="/wiki/Word-sense_induction" title="Word-sense induction">Word-sense induction</a></li></ul> </div><table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbody><tr><th id="Text_segmentation" scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Text_segmentation" title="Text segmentation">Text segmentation</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Compound-term_processing" title="Compound-term processing">Compound-term processing</a></li> <li><a href="/wiki/Lemmatisation" class="mw-redirect" 
title="Lemmatisation">Lemmatisation</a></li> <li><a class="mw-selflink selflink">Lexical analysis</a></li> <li><a href="/wiki/Shallow_parsing" title="Shallow parsing">Text chunking</a></li> <li><a href="/wiki/Stemming" title="Stemming">Stemming</a></li> <li><a href="/wiki/Sentence_boundary_disambiguation" title="Sentence boundary disambiguation">Sentence segmentation</a></li> <li><a href="/wiki/Word#Word_boundaries" title="Word">Word segmentation</a></li></ul> </div></td></tr></tbody></table><div> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Automatic_summarization" title="Automatic summarization">Automatic summarization</a></th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Multi-document_summarization" title="Multi-document summarization">Multi-document summarization</a></li> <li><a href="/wiki/Sentence_extraction" title="Sentence extraction">Sentence extraction</a></li> <li><a href="/wiki/Text_simplification" title="Text simplification">Text simplification</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Machine_translation" title="Machine translation">Machine translation</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Computer-assisted_translation" title="Computer-assisted translation">Computer-assisted</a></li> <li><a href="/wiki/Example-based_machine_translation" title="Example-based machine translation">Example-based</a></li> <li><a href="/wiki/Rule-based_machine_translation" title="Rule-based machine translation">Rule-based</a></li> <li><a href="/wiki/Statistical_machine_translation" title="Statistical machine translation">Statistical</a></li> <li><a href="/wiki/Transfer-based_machine_translation" title="Transfer-based machine translation">Transfer-based</a></li> 
<li><a href="/wiki/Neural_machine_translation" title="Neural machine translation">Neural</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Distributional_semantics" title="Distributional semantics">Distributional semantics</a> models</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/BERT_(language_model)" title="BERT (language model)">BERT</a></li> <li><a href="/wiki/Document-term_matrix" title="Document-term matrix">Document-term matrix</a></li> <li><a href="/wiki/Explicit_semantic_analysis" title="Explicit semantic analysis">Explicit semantic analysis</a></li> <li><a href="/wiki/FastText" title="FastText">fastText</a></li> <li><a href="/wiki/GloVe" title="GloVe">GloVe</a></li> <li><a href="/wiki/Language_model" title="Language model">Language model</a> (<a href="/wiki/Large_language_model" title="Large language model">large</a>)</li> <li><a href="/wiki/Latent_semantic_analysis" title="Latent semantic analysis">Latent semantic analysis</a></li> <li><a href="/wiki/Seq2seq" title="Seq2seq">Seq2seq</a></li> <li><a href="/wiki/Word_embedding" title="Word embedding">Word embedding</a></li> <li><a href="/wiki/Word2vec" title="Word2vec">Word2vec</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Language_resource" title="Language resource">Language resources</a>,<br />datasets and corpora</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"></div><table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbody><tr><th scope="row" class="navbox-group" style="width:1%">Types and<br />standards</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Corpus_linguistics" title="Corpus linguistics">Corpus 
linguistics</a></li> <li><a href="/wiki/Lexical_resource" title="Lexical resource">Lexical resource</a></li> <li><a href="/wiki/Linguistic_Linked_Open_Data" title="Linguistic Linked Open Data">Linguistic Linked Open Data</a></li> <li><a href="/wiki/Machine-readable_dictionary" title="Machine-readable dictionary">Machine-readable dictionary</a></li> <li><a href="/wiki/Parallel_text" title="Parallel text">Parallel text</a></li> <li><a href="/wiki/PropBank" title="PropBank">PropBank</a></li> <li><a href="/wiki/Semantic_network" title="Semantic network">Semantic network</a></li> <li><a href="/wiki/Simple_Knowledge_Organization_System" title="Simple Knowledge Organization System">Simple Knowledge Organization System</a></li> <li><a href="/wiki/Speech_corpus" title="Speech corpus">Speech corpus</a></li> <li><a href="/wiki/Text_corpus" title="Text corpus">Text corpus</a></li> <li><a href="/wiki/Thesaurus_(information_retrieval)" title="Thesaurus (information retrieval)">Thesaurus (information retrieval)</a></li> <li><a href="/wiki/Treebank" title="Treebank">Treebank</a></li> <li><a href="/wiki/Universal_Dependencies" title="Universal Dependencies">Universal Dependencies</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Data</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/BabelNet" title="BabelNet">BabelNet</a></li> <li><a href="/wiki/Bank_of_English" title="Bank of English">Bank of English</a></li> <li><a href="/wiki/DBpedia" title="DBpedia">DBpedia</a></li> <li><a href="/wiki/FrameNet" title="FrameNet">FrameNet</a></li> <li><a href="/wiki/Google_Ngram_Viewer" class="mw-redirect" title="Google Ngram Viewer">Google Ngram Viewer</a></li> <li><a href="/wiki/UBY" title="UBY">UBY</a></li> <li><a href="/wiki/WordNet" title="WordNet">WordNet</a></li> <li><a href="/wiki/Wikidata" title="Wikidata">Wikidata</a></li></ul> 
</div></td></tr></tbody></table><div></div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Automatic_identification_and_data_capture" title="Automatic identification and data capture">Automatic identification<br />and data capture</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Speech_recognition" title="Speech recognition">Speech recognition</a></li> <li><a href="/wiki/Speech_segmentation" title="Speech segmentation">Speech segmentation</a></li> <li><a href="/wiki/Speech_synthesis" title="Speech synthesis">Speech synthesis</a></li> <li><a href="/wiki/Natural_language_generation" title="Natural language generation">Natural language generation</a></li> <li><a href="/wiki/Optical_character_recognition" title="Optical character recognition">Optical character recognition</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Topic_model" title="Topic model">Topic model</a></th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Document_classification" title="Document classification">Document classification</a></li> <li><a href="/wiki/Latent_Dirichlet_allocation" title="Latent Dirichlet allocation">Latent Dirichlet allocation</a></li> <li><a href="/wiki/Pachinko_allocation" title="Pachinko allocation">Pachinko allocation</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Computer-assisted_reviewing" title="Computer-assisted reviewing">Computer-assisted<br />reviewing</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Automated_essay_scoring" title="Automated essay scoring">Automated essay scoring</a></li> <li><a href="/wiki/Concordancer" 
title="Concordancer">Concordancer</a></li> <li><a href="/wiki/Grammar_checker" title="Grammar checker">Grammar checker</a></li> <li><a href="/wiki/Predictive_text" title="Predictive text">Predictive text</a></li> <li><a href="/wiki/Pronunciation_assessment" title="Pronunciation assessment">Pronunciation assessment</a></li> <li><a href="/wiki/Spell_checker" title="Spell checker">Spell checker</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Natural-language_user_interface" title="Natural-language user interface">Natural language<br />user interface</a></th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Chatbot" title="Chatbot">Chatbot</a></li> <li><a href="/wiki/Interactive_fiction" title="Interactive fiction">Interactive fiction</a> (cf. <a href="/wiki/Syntax_guessing" class="mw-redirect" title="Syntax guessing">Syntax guessing</a>)</li> <li><a href="/wiki/Question_answering" title="Question answering">Question answering</a></li> <li><a href="/wiki/Virtual_assistant" title="Virtual assistant">Virtual assistant</a></li> <li><a href="/wiki/Voice_user_interface" title="Voice user interface">Voice user interface</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Related</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Formal_semantics_(natural_language)" title="Formal semantics (natural language)">Formal semantics</a></li> <li><a href="/wiki/Hallucination_(artificial_intelligence)" title="Hallucination (artificial intelligence)">Hallucination</a></li> <li><a href="/wiki/Natural_Language_Toolkit" title="Natural Language Toolkit">Natural Language Toolkit</a></li> <li><a href="/wiki/SpaCy" title="SpaCy">spaCy</a></li></ul> </div></td></tr></tbody></table></div> <!-- NewPP limit report Parsed by 
mw‐web.eqiad.main‐5dc468848‐mkmlx Cached time: 20241122140504 Cache expiry: 2592000 Reduced expiry: false Complications: [vary‐revision‐sha1, show‐toc] CPU time usage: 0.399 seconds Real time usage: 0.958 seconds Preprocessor visited node count: 2315/1000000 Post‐expand include size: 48484/2097152 bytes Template argument size: 2585/2097152 bytes Highest expansion depth: 14/100 Expensive parser function count: 28/500 Unstrip recursion depth: 1/20 Unstrip post‐expand size: 45391/5000000 bytes Lua time usage: 0.220/10.000 seconds Lua memory usage: 5355478/52428800 bytes Number of Wikibase entities loaded: 0/400 --> <!-- Transclusion expansion time report (%,ms,calls,template) 100.00% 885.782 1 -total 46.63% 413.045 17 Template:Code 14.92% 132.171 1 Template:Reflist 12.68% 112.348 5 Template:Cite_web 9.92% 87.839 1 Template:Natural_Language_Processing 9.73% 86.159 3 Template:Navbox 8.38% 74.250 1 Template:Short_description 5.52% 48.875 1 Template:Redirect 4.29% 38.033 2 Template:Fix 4.22% 37.413 2 Template:Pagetype --> <!-- Saved in parser cache with key enwiki:pcache:idhash:81251-0!canonical and timestamp 20241122140504 and revision id 1241869300. 
Rendering was triggered because: page-view -->
<!-- Closes a container opened earlier in the document. The ESI include stays commented out; the noscript image is a 1x1 CentralAutoLogin request that is only used when scripting is disabled. -->
</div><!--esi <esi:include src="/esitest-fa8a495983347898/content" /> --><noscript><img src="https://login.wikimedia.org/wiki/Special:CentralAutoLogin/start?type=1x1" alt="" width="1" height="1" style="border: none; position: absolute;"></noscript>
<!-- Print footer: link to revision 1241869300 of the article. Ampersands are escaped so the URL is unambiguous in both the attribute and the visible text. -->
<div class="printfooter" data-nosnippet="">Retrieved from "<a dir="ltr" href="https://en.wikipedia.org/w/index.php?title=Lexical_analysis&amp;oldid=1241869300">https://en.wikipedia.org/w/index.php?title=Lexical_analysis&amp;oldid=1241869300</a>"</div></div>
<!-- Category links: the "Categories" list followed by the "Hidden categories" list. The list items are kept adjacent (no whitespace between them) exactly as generated. -->
<div id="catlinks" class="catlinks" data-mw="interface"><div id="mw-normal-catlinks" class="mw-normal-catlinks"><a href="/wiki/Help:Category" title="Help:Category">Categories</a>:
<ul><li><a href="/wiki/Category:Lexical_analysis" title="Category:Lexical analysis">Lexical analysis</a></li><li><a href="/wiki/Category:Compiler_construction" title="Category:Compiler construction">Compiler construction</a></li><li><a href="/wiki/Category:Programming_language_implementation" title="Category:Programming language implementation">Programming language implementation</a></li><li><a href="/wiki/Category:Parsing" title="Category:Parsing">Parsing</a></li></ul></div><div id="mw-hidden-catlinks" class="mw-hidden-catlinks mw-hidden-cats-hidden">Hidden categories:
<ul><li><a href="/wiki/Category:Articles_with_short_description" title="Category:Articles with short description">Articles with short description</a></li><li><a href="/wiki/Category:Short_description_is_different_from_Wikidata" title="Category:Short description is different from Wikidata">Short description is different from Wikidata</a></li><li><a href="/wiki/Category:All_accuracy_disputes" title="Category:All accuracy disputes">All accuracy disputes</a></li><li><a href="/wiki/Category:Articles_with_disputed_statements_from_May_2010" title="Category:Articles with disputed statements from May 2010">Articles with disputed statements from May 2010</a></li><li><a href="/wiki/Category:All_articles_with_unsourced_statements" title="Category:All articles with unsourced statements">All articles with unsourced statements</a></li><li><a href="/wiki/Category:Articles_with_unsourced_statements_from_April_2008" title="Category:Articles with unsourced statements from April 2008">Articles with unsourced statements from April 2008</a></li></ul></div></div>
</div>
</main>
</div>
<!-- Site footer: last-edit and licence notice, policy/about links, then the two badge links. -->
<div class="mw-footer-container">
  <footer id="footer" class="mw-footer">
    <ul id="footer-info">
      <li id="footer-info-lastmod"> This page was last edited on 23 August 2024, at 15:52<span class="anonymous-show"> (UTC)</span>.</li>
      <li id="footer-info-copyright">Text is available under the <a href="/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_4.0_International_License" title="Wikipedia:Text of the Creative Commons Attribution-ShareAlike 4.0 International License">Creative Commons Attribution-ShareAlike 4.0 License</a>;
        additional terms may apply. By using this site, you agree to the <a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Terms_of_Use" class="extiw" title="foundation:Special:MyLanguage/Policy:Terms of Use">Terms of Use</a> and <a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy" class="extiw" title="foundation:Special:MyLanguage/Policy:Privacy policy">Privacy Policy</a>.
        Wikipedia® is a registered trademark of the <a rel="nofollow" class="external text" href="https://wikimediafoundation.org/">Wikimedia Foundation, Inc.</a>, a non-profit organization.</li>
    </ul>
    <!-- Links that were protocol-relative in the generated markup are written with an explicit https: scheme. -->
    <ul id="footer-places">
      <li id="footer-places-privacy"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy">Privacy policy</a></li>
      <li id="footer-places-about"><a href="/wiki/Wikipedia:About">About Wikipedia</a></li>
      <li id="footer-places-disclaimers"><a href="/wiki/Wikipedia:General_disclaimer">Disclaimers</a></li>
      <li id="footer-places-contact"><a href="https://en.wikipedia.org/wiki/Wikipedia:Contact_us">Contact Wikipedia</a></li>
      <li id="footer-places-wm-codeofconduct"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Universal_Code_of_Conduct">Code of Conduct</a></li>
      <li id="footer-places-developers"><a href="https://developer.wikimedia.org">Developers</a></li>
      <li id="footer-places-statslink"><a href="https://stats.wikimedia.org/#/en.wikipedia.org">Statistics</a></li>
      <li id="footer-places-cookiestatement"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Cookie_statement">Cookie statement</a></li>
      <li id="footer-places-mobileview"><a href="https://en.m.wikipedia.org/w/index.php?title=Lexical_analysis&amp;mobileaction=toggle_view_mobile" class="noprint stopMobileRedirectToggle">Mobile view</a></li>
    </ul>
    <ul id="footer-icons" class="noprint">
      <li id="footer-copyrightico"><a href="https://wikimediafoundation.org/" class="cdx-button cdx-button--fake-button cdx-button--size-large cdx-button--fake-button--enabled"><img src="/static/images/footer/wikimedia-button.svg" width="84" height="29" alt="Wikimedia Foundation" loading="lazy"></a></li>
      <li id="footer-poweredbyico"><a href="https://www.mediawiki.org/" class="cdx-button cdx-button--fake-button cdx-button--size-large cdx-button--fake-button--enabled"><img src="/w/resources/assets/poweredby_mediawiki.svg" alt="Powered by MediaWiki" width="88" height="31" loading="lazy"></a></li>
    </ul>
  </footer>
</div>
</div>
</div>
<!-- The script below queues a callback on RLQ that calls mw.config.set with wgHostname, wgBackendResponseTime and wgPageParseReport. Its content is unchanged. -->
<div class="vector-settings" id="p-dock-bottom">
  <ul></ul>
</div><script>(RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgHostname":"mw-web.codfw.main-f69cdc8f6-svl76","wgBackendResponseTime":211,"wgPageParseReport":{"limitreport":{"cputime":"0.399","walltime":"0.958","ppvisitednodes":{"value":2315,"limit":1000000},"postexpandincludesize":{"value":48484,"limit":2097152},"templateargumentsize":{"value":2585,"limit":2097152},"expansiondepth":{"value":14,"limit":100},"expensivefunctioncount":{"value":28,"limit":500},"unstrip-depth":{"value":1,"limit":20},"unstrip-size":{"value":45391,"limit":5000000},"entityaccesscount":{"value":0,"limit":400},"timingprofile":["100.00% 885.782 1 -total"," 46.63% 413.045 17 Template:Code"," 14.92% 132.171 1 Template:Reflist"," 12.68% 112.348 5 Template:Cite_web"," 9.92% 87.839 1 Template:Natural_Language_Processing"," 9.73% 86.159 3 Template:Navbox"," 8.38% 74.250 1 Template:Short_description"," 5.52% 48.875 1 Template:Redirect"," 4.29% 38.033 2 Template:Fix"," 4.22% 37.413 2 Template:Pagetype"]},"scribunto":{"limitreport-timeusage":{"value":"0.220","limit":"10.000"},"limitreport-memusage":{"value":5355478,"limit":52428800}},"cachereport":{"origin":"mw-web.eqiad.main-5dc468848-mkmlx","timestamp":"20241122140504","ttl":2592000,"transientcontent":false}}});});</script>
<!-- schema.org Article metadata as JSON-LD. It must stay on a single line: a raw line break inside any string value makes the JSON invalid. -->
<script type="application/ld+json">{"@context":"https:\/\/schema.org","@type":"Article","name":"Lexical analysis","url":"https:\/\/en.wikipedia.org\/wiki\/Lexical_analysis","sameAs":"http:\/\/www.wikidata.org\/entity\/Q835922","mainEntity":"http:\/\/www.wikidata.org\/entity\/Q835922","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\/\/www.wikimedia.org\/static\/images\/wmf-hor-googpub.png"}},"datePublished":"2002-09-06T12:07:01Z","dateModified":"2024-08-23T15:52:00Z","headline":"computing process of parsing a sequence of characters into a sequence of tokens"}</script>
</body>
</html>