CINXE.COM
Text corpus - Wikipedia
<!DOCTYPE html> <html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" lang="en" dir="ltr"> <head> <meta charset="UTF-8"> <title>Text corpus - Wikipedia</title> <script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available";var cookie=document.cookie.match(/(?:^|; )enwikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].split('%2C').forEach(function(pref){className=className.replace(new RegExp('(^| )'+pref.replace(/-clientpref-\w+$|[^\w-]+/g,'')+'-clientpref-\\w+( |$)'),'$1'+pref+'$2');});}document.documentElement.className=className;}());RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"92eeed4e-4e16-46ec-a7ca-da3d11b7c92c","wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Text_corpus","wgTitle":"Text corpus","wgCurRevisionId":1257324423,"wgRevisionId":1257324423,"wgArticleId":53887,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Use dmy dates from July 2022","Webarchive template wayback links","Discourse analysis","Corpus linguistics","Computational linguistics","Works based on multiple works","Test items"],"wgPageViewLanguage":"en","wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"Text_corpus","wgRelevantArticleId":53887,"wgIsProbablyEditable":true,"wgRelevantPageIsProbablyEditable":true,"wgRestrictionEdit":[],"wgRestrictionMove":[],"wgNoticeProject":"wikipedia","wgCiteReferencePreviewsActive":false,"wgFlaggedRevsParams":{"tags":{"status":{"levels":1}}},"wgMediaViewerOnClick":true,"wgMediaViewerEnabledByDefault":true,"wgPopupsFlags":0,"wgVisualEditor":{"pageLanguageCode":"en","pageLanguageDir":"ltr","pageVariantFallbacks":"en"},"wgMFDisplayWikibaseDescriptions":{"search":true,"watchlist":true,"tagline":false,"nearby":true},"wgWMESchemaEditAttemptStepOversample":false,"wgWMEPageLength":8000,"wgEditSubmitButtonLabelPublish":true,"wgULSPosition":"interlanguage","wgULSisCompactLinksEnabled":false,"wgVector2022LanguageInHeader":true,"wgULSisLanguageSelectorEmpty":false,"wgWikibaseItemId":"Q461183","wgCheckUserClientHintsHeadersJsApi":["brands","architecture","bitness","fullVersionList","mobile","model","platform","platformVersion"],"GEHomepageSuggestedEditsEnableTopics":true,"wgGETopicsMatchModeEnabled":false,"wgGEStructuredTaskRejectionReasonTextInputEnabled":false,"wgGELevelingUpEnabledForUser":false}; RLSTATE={"ext.globalCssJs.user.styles":"ready","site.styles":"ready","user.styles":"ready","ext.globalCssJs.user":"ready","user":"ready","user.options":"loading","ext.cite.styles":"ready","skins.vector.search.codex.styles":"ready","skins.vector.styles":"ready","skins.vector.icons":"ready","jquery.makeCollapsible.styles":"ready","ext.wikimediamessages.styles":"ready","ext.visualEditor.desktopArticleTarget.noscript":"ready","ext.uls.interlanguage":"ready","wikibase.client.init":"ready"};RLPAGEMODULES=["ext.cite.ux-enhancements","site","mediawiki.page.ready","jquery.makeCollapsible","mediawiki.toc","skins.vector.js","ext.centralNotice.geoIP","ext.centralNotice.startUp","ext.gadget.ReferenceTooltips","ext.gadget.switcher","ext.urlShortener.toolbar","ext.centralauth.centralautologin","ext.popups","ext.visualEditor.desktopArticleTarget.init","ext.visualEditor.targetLoader","ext.echo.centralauth","ext.eventLogging","ext.wikimediaEvents","ext.navigationTiming","ext.uls.interface","ext.cx.eventlogging.campaigns","ext.cx.uls.quick.actions","wikibase.client.vector-2022","ext.checkUser.clientHints","ext.growthExperiments.SuggestedEditSession"];</script> <script>(RLQ=window.RLQ||[]).push(function(){mw.loader.impl(function(){return["user.options@12s5i",function($,jQuery,require,module){mw.user.tokens.set({"patrolToken":"+\\","watchToken":"+\\","csrfToken":"+\\"}); }];});});</script> <link rel="stylesheet" href="/w/load.php?lang=en&modules=ext.cite.styles%7Cext.uls.interlanguage%7Cext.visualEditor.desktopArticleTarget.noscript%7Cext.wikimediamessages.styles%7Cjquery.makeCollapsible.styles%7Cskins.vector.icons%2Cstyles%7Cskins.vector.search.codex.styles%7Cwikibase.client.init&only=styles&skin=vector-2022"> <script async="" src="/w/load.php?lang=en&modules=startup&only=scripts&raw=1&skin=vector-2022"></script> <meta name="ResourceLoaderDynamicStyles" content=""> <link rel="stylesheet" href="/w/load.php?lang=en&modules=site.styles&only=styles&skin=vector-2022"> <meta name="generator" content="MediaWiki 1.44.0-wmf.20"> <meta name="referrer" content="origin"> <meta name="referrer" content="origin-when-cross-origin"> <meta name="robots" content="max-image-preview:standard"> <meta name="format-detection" content="telephone=no"> <meta name="viewport" content="width=1120"> <meta property="og:title" content="Text corpus - Wikipedia"> <meta property="og:type" content="website"> <link rel="alternate" media="only screen and (max-width: 640px)" href="//en.m.wikipedia.org/wiki/Text_corpus"> <link rel="alternate" type="application/x-wiki" title="Edit this page" href="/w/index.php?title=Text_corpus&action=edit"> <link rel="apple-touch-icon" href="/static/apple-touch/wikipedia.png"> <link rel="icon" href="/static/favicon/wikipedia.ico"> <link rel="search" type="application/opensearchdescription+xml" href="/w/rest.php/v1/search" title="Wikipedia (en)"> <link rel="EditURI" type="application/rsd+xml" href="//en.wikipedia.org/w/api.php?action=rsd"> <link rel="canonical" href="https://en.wikipedia.org/wiki/Text_corpus"> <link rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/deed.en"> <link rel="alternate" type="application/atom+xml" title="Wikipedia Atom feed" href="/w/index.php?title=Special:RecentChanges&feed=atom"> <link rel="dns-prefetch" href="//meta.wikimedia.org" /> <link rel="dns-prefetch" href="login.wikimedia.org"> </head> <body class="skin--responsive skin-vector skin-vector-search-vue mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject mw-editable page-Text_corpus rootpage-Text_corpus skin-vector-2022 action-view"><a class="mw-jump-link" href="#bodyContent">Jump to content</a> <div class="vector-header-container"> <header class="vector-header mw-header"> <div class="vector-header-start"> <nav class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-dropdown" class="vector-dropdown vector-main-menu-dropdown vector-button-flush-left vector-button-flush-right" title="Main menu" > <input type="checkbox" id="vector-main-menu-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-main-menu-dropdown" class="vector-dropdown-checkbox " aria-label="Main menu" > <label id="vector-main-menu-dropdown-label" for="vector-main-menu-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-menu mw-ui-icon-wikimedia-menu"></span> <span class="vector-dropdown-label-text">Main menu</span> </label> <div class="vector-dropdown-content"> <div id="vector-main-menu-unpinned-container" class="vector-unpinned-container"> <div id="vector-main-menu" class="vector-main-menu vector-pinnable-element"> <div class="vector-pinnable-header vector-main-menu-pinnable-header vector-pinnable-header-unpinned" data-feature-name="main-menu-pinned" data-pinnable-element-id="vector-main-menu" data-pinned-container-id="vector-main-menu-pinned-container" data-unpinned-container-id="vector-main-menu-unpinned-container" > <div class="vector-pinnable-header-label">Main menu</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-main-menu.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-main-menu.unpin">hide</button> </div> <div id="p-navigation" class="vector-menu mw-portlet mw-portlet-navigation" > <div class="vector-menu-heading"> Navigation </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="n-mainpage-description" class="mw-list-item"><a href="/wiki/Main_Page" title="Visit the main page [z]" accesskey="z"><span>Main page</span></a></li><li id="n-contents" class="mw-list-item"><a href="/wiki/Wikipedia:Contents" title="Guides to browsing Wikipedia"><span>Contents</span></a></li><li id="n-currentevents" class="mw-list-item"><a href="/wiki/Portal:Current_events" title="Articles related to current events"><span>Current events</span></a></li><li id="n-randompage" class="mw-list-item"><a href="/wiki/Special:Random" title="Visit a randomly selected article [x]" accesskey="x"><span>Random article</span></a></li><li id="n-aboutsite" class="mw-list-item"><a href="/wiki/Wikipedia:About" title="Learn about Wikipedia and how it works"><span>About Wikipedia</span></a></li><li id="n-contactpage" class="mw-list-item"><a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us" title="How to contact Wikipedia"><span>Contact us</span></a></li> </ul> </div> </div> <div id="p-interaction" class="vector-menu mw-portlet mw-portlet-interaction" > <div class="vector-menu-heading"> Contribute </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="n-help" class="mw-list-item"><a href="/wiki/Help:Contents" title="Guidance on how to use and edit Wikipedia"><span>Help</span></a></li><li id="n-introduction" class="mw-list-item"><a href="/wiki/Help:Introduction" title="Learn how to edit Wikipedia"><span>Learn to edit</span></a></li><li id="n-portal" class="mw-list-item"><a href="/wiki/Wikipedia:Community_portal" title="The hub for editors"><span>Community portal</span></a></li><li id="n-recentchanges" class="mw-list-item"><a href="/wiki/Special:RecentChanges" title="A list of recent changes to Wikipedia [r]" accesskey="r"><span>Recent changes</span></a></li><li id="n-upload" class="mw-list-item"><a href="/wiki/Wikipedia:File_upload_wizard" title="Add images or other media for use on Wikipedia"><span>Upload file</span></a></li><li id="n-specialpages" class="mw-list-item"><a href="/wiki/Special:SpecialPages"><span>Special pages</span></a></li> </ul> </div> </div> </div> </div> </div> </div> </nav> <a href="/wiki/Main_Page" class="mw-logo"> <img class="mw-logo-icon" src="/static/images/icons/wikipedia.png" alt="" aria-hidden="true" height="50" width="50"> <span class="mw-logo-container skin-invert"> <img class="mw-logo-wordmark" alt="Wikipedia" src="/static/images/mobile/copyright/wikipedia-wordmark-en.svg" style="width: 7.5em; height: 1.125em;"> <img class="mw-logo-tagline" alt="The Free Encyclopedia" src="/static/images/mobile/copyright/wikipedia-tagline-en.svg" width="117" height="13" style="width: 7.3125em; height: 0.8125em;"> </span> </a> </div> <div class="vector-header-end"> <div id="p-search" role="search" class="vector-search-box-vue vector-search-box-collapses vector-search-box-show-thumbnail vector-search-box-auto-expand-width vector-search-box"> <a href="/wiki/Special:Search" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only search-toggle" title="Search Wikipedia [f]" accesskey="f"><span class="vector-icon mw-ui-icon-search mw-ui-icon-wikimedia-search"></span> <span>Search</span> </a> <div class="vector-typeahead-search-container"> <div class="cdx-typeahead-search cdx-typeahead-search--show-thumbnail cdx-typeahead-search--auto-expand-width"> <form action="/w/index.php" id="searchform" class="cdx-search-input cdx-search-input--has-end-button"> <div id="simpleSearch" class="cdx-search-input__input-wrapper" data-search-loc="header-moved"> <div class="cdx-text-input cdx-text-input--has-start-icon"> <input class="cdx-text-input__input" type="search" name="search" placeholder="Search Wikipedia" aria-label="Search Wikipedia" autocapitalize="sentences" title="Search Wikipedia [f]" accesskey="f" id="searchInput" > <span class="cdx-text-input__icon cdx-text-input__start-icon"></span> </div> <input type="hidden" name="title" value="Special:Search"> </div> <button class="cdx-button cdx-search-input__end-button">Search</button> </form> </div> </div> </div> <nav class="vector-user-links vector-user-links-wide" aria-label="Personal tools"> <div class="vector-user-links-main"> <div id="p-vector-user-menu-preferences" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <div id="p-vector-user-menu-userpage" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <nav class="vector-appearance-landmark" aria-label="Appearance"> <div id="vector-appearance-dropdown" class="vector-dropdown " title="Change the appearance of the page's font size, width, and color" > <input type="checkbox" id="vector-appearance-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-appearance-dropdown" class="vector-dropdown-checkbox " aria-label="Appearance" > <label id="vector-appearance-dropdown-label" for="vector-appearance-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-appearance mw-ui-icon-wikimedia-appearance"></span> <span class="vector-dropdown-label-text">Appearance</span> </label> <div class="vector-dropdown-content"> <div id="vector-appearance-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <div id="p-vector-user-menu-notifications" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <div id="p-vector-user-menu-overflow" class="vector-menu mw-portlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-sitesupport-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en" class=""><span>Donate</span></a> </li> <li id="pt-createaccount-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="/w/index.php?title=Special:CreateAccount&returnto=Text+corpus" title="You are encouraged to create an account and log in; however, it is not mandatory" class=""><span>Create account</span></a> </li> <li id="pt-login-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="/w/index.php?title=Special:UserLogin&returnto=Text+corpus" title="You're encouraged to log in; however, it's not mandatory. [o]" accesskey="o" class=""><span>Log in</span></a> </li> </ul> </div> </div> </div> <div id="vector-user-links-dropdown" class="vector-dropdown vector-user-menu vector-button-flush-right vector-user-menu-logged-out" title="Log in and more options" > <input type="checkbox" id="vector-user-links-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-user-links-dropdown" class="vector-dropdown-checkbox " aria-label="Personal tools" > <label id="vector-user-links-dropdown-label" for="vector-user-links-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-ellipsis mw-ui-icon-wikimedia-ellipsis"></span> <span class="vector-dropdown-label-text">Personal tools</span> </label> <div class="vector-dropdown-content"> <div id="p-personal" class="vector-menu mw-portlet mw-portlet-personal user-links-collapsible-item" title="User menu" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-sitesupport" class="user-links-collapsible-item mw-list-item"><a href="https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en"><span>Donate</span></a></li><li id="pt-createaccount" class="user-links-collapsible-item mw-list-item"><a href="/w/index.php?title=Special:CreateAccount&returnto=Text+corpus" title="You are encouraged to create an account and log in; however, it is not mandatory"><span class="vector-icon mw-ui-icon-userAdd mw-ui-icon-wikimedia-userAdd"></span> <span>Create account</span></a></li><li id="pt-login" class="user-links-collapsible-item mw-list-item"><a href="/w/index.php?title=Special:UserLogin&returnto=Text+corpus" title="You're encouraged to log in; however, it's not mandatory. [o]" accesskey="o"><span class="vector-icon mw-ui-icon-logIn mw-ui-icon-wikimedia-logIn"></span> <span>Log in</span></a></li> </ul> </div> </div> <div id="p-user-menu-anon-editor" class="vector-menu mw-portlet mw-portlet-user-menu-anon-editor" > <div class="vector-menu-heading"> Pages for logged out editors <a href="/wiki/Help:Introduction" aria-label="Learn more about editing"><span>learn more</span></a> </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-anoncontribs" class="mw-list-item"><a href="/wiki/Special:MyContributions" title="A list of edits made from this IP address [y]" accesskey="y"><span>Contributions</span></a></li><li id="pt-anontalk" class="mw-list-item"><a href="/wiki/Special:MyTalk" title="Discussion about edits from this IP address [n]" accesskey="n"><span>Talk</span></a></li> </ul> </div> </div> </div> </div> </nav> </div> </header> </div> <div class="mw-page-container"> <div class="mw-page-container-inner"> <div class="vector-sitenotice-container"> <div id="siteNotice"><!-- CentralNotice --></div> </div> <div class="vector-column-start"> <div class="vector-main-menu-container"> <div id="mw-navigation"> <nav id="mw-panel" class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-pinned-container" class="vector-pinned-container"> </div> </nav> </div> </div> <div class="vector-sticky-pinned-container"> <nav id="mw-panel-toc" aria-label="Contents" data-event-name="ui.sidebar-toc" class="mw-table-of-contents-container vector-toc-landmark"> <div id="vector-toc-pinned-container" class="vector-pinned-container"> <div id="vector-toc" class="vector-toc vector-pinnable-element"> <div class="vector-pinnable-header vector-toc-pinnable-header vector-pinnable-header-pinned" data-feature-name="toc-pinned" data-pinnable-element-id="vector-toc" > <h2 class="vector-pinnable-header-label">Contents</h2> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-toc.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-toc.unpin">hide</button> </div> <ul class="vector-toc-contents" id="mw-panel-toc-list"> <li id="toc-mw-content-text" class="vector-toc-list-item vector-toc-level-1"> <a href="#" class="vector-toc-link"> <div class="vector-toc-text">(Top)</div> </a> </li> <li id="toc-Overview" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Overview"> <div class="vector-toc-text"> <span class="vector-toc-numb">1</span> <span>Overview</span> </div> </a> <ul id="toc-Overview-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Applications" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Applications"> <div class="vector-toc-text"> <span class="vector-toc-numb">2</span> <span>Applications</span> </div> </a> <ul id="toc-Applications-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Some_notable_text_corpora" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Some_notable_text_corpora"> <div class="vector-toc-text"> <span class="vector-toc-numb">3</span> <span>Some notable text corpora</span> </div> </a> <ul id="toc-Some_notable_text_corpora-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-See_also" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#See_also"> <div class="vector-toc-text"> <span class="vector-toc-numb">4</span> <span>See also</span> </div> </a> <ul id="toc-See_also-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-References" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#References"> <div class="vector-toc-text"> <span class="vector-toc-numb">5</span> <span>References</span> </div> </a> <ul id="toc-References-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-External_links" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#External_links"> <div class="vector-toc-text"> <span class="vector-toc-numb">6</span> <span>External links</span> </div> </a> <ul id="toc-External_links-sublist" class="vector-toc-list"> </ul> </li> </ul> </div> </div> </nav> </div> </div> <div class="mw-content-container"> <main id="content" class="mw-body"> <header class="mw-body-header vector-page-titlebar"> <nav aria-label="Contents" class="vector-toc-landmark"> <div id="vector-page-titlebar-toc" class="vector-dropdown vector-page-titlebar-toc vector-button-flush-left" title="Table of Contents" > <input type="checkbox" id="vector-page-titlebar-toc-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-titlebar-toc" class="vector-dropdown-checkbox " aria-label="Toggle the table of contents" > <label id="vector-page-titlebar-toc-label" for="vector-page-titlebar-toc-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-listBullet mw-ui-icon-wikimedia-listBullet"></span> <span class="vector-dropdown-label-text">Toggle the table of contents</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-titlebar-toc-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <h1 id="firstHeading" class="firstHeading mw-first-heading"><span class="mw-page-title-main">Text corpus</span></h1> <div id="p-lang-btn" class="vector-dropdown mw-portlet mw-portlet-lang" > <input type="checkbox" id="p-lang-btn-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-p-lang-btn" class="vector-dropdown-checkbox mw-interlanguage-selector" aria-label="Go to an article in another language. Available in 51 languages" > <label id="p-lang-btn-label" for="p-lang-btn-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--action-progressive mw-portlet-lang-heading-51" aria-hidden="true" ><span class="vector-icon mw-ui-icon-language-progressive mw-ui-icon-wikimedia-language-progressive"></span> <span class="vector-dropdown-label-text">51 languages</span> </label> <div class="vector-dropdown-content"> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li class="interlanguage-link interwiki-af mw-list-item"><a href="https://af.wikipedia.org/wiki/Tekskorpus" title="Tekskorpus – Afrikaans" lang="af" hreflang="af" data-title="Tekskorpus" data-language-autonym="Afrikaans" data-language-local-name="Afrikaans" class="interlanguage-link-target"><span>Afrikaans</span></a></li><li class="interlanguage-link interwiki-ar mw-list-item"><a href="https://ar.wikipedia.org/wiki/%D9%85%D8%AF%D9%88%D9%86%D8%A9_%D9%86%D8%B5%D9%8A%D8%A9" title="مدونة نصية – Arabic" lang="ar" hreflang="ar" data-title="مدونة نصية" data-language-autonym="العربية" data-language-local-name="Arabic" class="interlanguage-link-target"><span>العربية</span></a></li><li class="interlanguage-link interwiki-zh-min-nan mw-list-item"><a href="https://zh-min-nan.wikipedia.org/wiki/G%C3%AD-li%C4%81u-kh%C3%B2%CD%98" title="Gí-liāu-khò͘ – Minnan" lang="nan" hreflang="nan" data-title="Gí-liāu-khò͘" data-language-autonym="閩南語 / Bân-lâm-gú" data-language-local-name="Minnan" class="interlanguage-link-target"><span>閩南語 / Bân-lâm-gú</span></a></li><li class="interlanguage-link interwiki-be mw-list-item"><a href="https://be.wikipedia.org/wiki/%D0%9A%D0%BE%D1%80%D0%BF%D1%83%D1%81_%D1%82%D1%8D%D0%BA%D1%81%D1%82%D0%B0%D1%9E" title="Корпус тэкстаў – Belarusian" lang="be" hreflang="be" data-title="Корпус тэкстаў" data-language-autonym="Беларуская" data-language-local-name="Belarusian" class="interlanguage-link-target"><span>Беларуская</span></a></li><li class="interlanguage-link interwiki-be-x-old mw-list-item"><a href="https://be-tarask.wikipedia.org/wiki/%D0%9A%D0%BE%D1%80%D0%BF%D1%83%D1%81_%D1%82%D1%8D%D0%BA%D1%81%D1%82%D0%B0%D1%9E" title="Корпус тэкстаў – Belarusian (Taraškievica orthography)" lang="be-tarask" hreflang="be-tarask" data-title="Корпус тэкстаў" data-language-autonym="Беларуская (тарашкевіца)" data-language-local-name="Belarusian (Taraškievica orthography)" class="interlanguage-link-target"><span>Беларуская (тарашкевіца)</span></a></li><li class="interlanguage-link interwiki-bg mw-list-item"><a href="https://bg.wikipedia.org/wiki/%D0%9A%D0%BE%D1%80%D0%BF%D1%83%D1%81_(%D0%BB%D0%B8%D0%BD%D0%B3%D0%B2%D0%B8%D1%81%D1%82%D0%B8%D0%BA%D0%B0)" title="Корпус (лингвистика) – Bulgarian" lang="bg" hreflang="bg" data-title="Корпус (лингвистика)" data-language-autonym="Български" data-language-local-name="Bulgarian" class="interlanguage-link-target"><span>Български</span></a></li><li class="interlanguage-link interwiki-ca mw-list-item"><a href="https://ca.wikipedia.org/wiki/Corpus_ling%C3%BC%C3%ADstic" title="Corpus lingüístic – Catalan" lang="ca" hreflang="ca" data-title="Corpus lingüístic" data-language-autonym="Català" data-language-local-name="Catalan" class="interlanguage-link-target"><span>Català</span></a></li><li class="interlanguage-link interwiki-cv mw-list-item"><a href="https://cv.wikipedia.org/wiki/%D0%A2%D0%B5%D0%BA%D1%81%D1%82%D1%81%D0%B5%D0%BD_%D1%85%D0%B0%D0%BF%D0%B8" title="Текстсен хапи – Chuvash" lang="cv" hreflang="cv" data-title="Текстсен хапи" data-language-autonym="Чӑвашла" data-language-local-name="Chuvash" class="interlanguage-link-target"><span>Чӑвашла</span></a></li><li class="interlanguage-link interwiki-cs mw-list-item"><a href="https://cs.wikipedia.org/wiki/Jazykov%C3%BD_korpus" title="Jazykový korpus – Czech" lang="cs" hreflang="cs" data-title="Jazykový korpus" data-language-autonym="Čeština" data-language-local-name="Czech" class="interlanguage-link-target"><span>Čeština</span></a></li><li class="interlanguage-link interwiki-cy mw-list-item"><a href="https://cy.wikipedia.org/wiki/Corpws_(ieithyddiaeth)" title="Corpws (ieithyddiaeth) – Welsh" lang="cy" hreflang="cy" data-title="Corpws (ieithyddiaeth)" data-language-autonym="Cymraeg" data-language-local-name="Welsh" class="interlanguage-link-target"><span>Cymraeg</span></a></li><li class="interlanguage-link interwiki-da mw-list-item"><a href="https://da.wikipedia.org/wiki/Tekstkorpus" title="Tekstkorpus – Danish" lang="da" hreflang="da" data-title="Tekstkorpus" data-language-autonym="Dansk" data-language-local-name="Danish" class="interlanguage-link-target"><span>Dansk</span></a></li><li class="interlanguage-link interwiki-de mw-list-item"><a href="https://de.wikipedia.org/wiki/Textkorpus" title="Textkorpus – German" lang="de" hreflang="de" data-title="Textkorpus" data-language-autonym="Deutsch" data-language-local-name="German" class="interlanguage-link-target"><span>Deutsch</span></a></li><li class="interlanguage-link interwiki-et mw-list-item"><a href="https://et.wikipedia.org/wiki/Keelekorpus" title="Keelekorpus – Estonian" lang="et" hreflang="et" data-title="Keelekorpus" data-language-autonym="Eesti" data-language-local-name="Estonian" class="interlanguage-link-target"><span>Eesti</span></a></li><li class="interlanguage-link interwiki-el mw-list-item"><a href="https://el.wikipedia.org/wiki/%CE%A3%CF%8E%CE%BC%CE%B1_%CE%BA%CE%B5%CE%B9%CE%BC%CE%AD%CE%BD%CF%89%CE%BD" title="Σώμα κειμένων – Greek" lang="el" hreflang="el" data-title="Σώμα κειμένων" data-language-autonym="Ελληνικά" data-language-local-name="Greek" class="interlanguage-link-target"><span>Ελληνικά</span></a></li><li class="interlanguage-link interwiki-es mw-list-item"><a href="https://es.wikipedia.org/wiki/Corpus_ling%C3%BC%C3%ADstico" title="Corpus lingüístico – Spanish" lang="es" hreflang="es" data-title="Corpus lingüístico" data-language-autonym="Español" data-language-local-name="Spanish" class="interlanguage-link-target"><span>Español</span></a></li><li class="interlanguage-link interwiki-eo mw-list-item"><a href="https://eo.wikipedia.org/wiki/Korpuso" title="Korpuso – Esperanto" lang="eo" hreflang="eo" data-title="Korpuso" data-language-autonym="Esperanto" data-language-local-name="Esperanto" class="interlanguage-link-target"><span>Esperanto</span></a></li><li class="interlanguage-link interwiki-eu mw-list-item"><a href="https://eu.wikipedia.org/wiki/Testu_corpus" title="Testu corpus – Basque" lang="eu" hreflang="eu" data-title="Testu corpus" data-language-autonym="Euskara" data-language-local-name="Basque" class="interlanguage-link-target"><span>Euskara</span></a></li><li class="interlanguage-link interwiki-fa mw-list-item"><a href="https://fa.wikipedia.org/wiki/%D9%BE%DB%8C%DA%A9%D8%B1%D9%87_%D9%85%D8%AA%D9%86%DB%8C" title="پیکره متنی – Persian" lang="fa" hreflang="fa" data-title="پیکره متنی" data-language-autonym="فارسی" data-language-local-name="Persian" class="interlanguage-link-target"><span>فارسی</span></a></li><li class="interlanguage-link interwiki-fr mw-list-item"><a href="https://fr.wikipedia.org/wiki/Corpus" title="Corpus – French" lang="fr" hreflang="fr" data-title="Corpus" data-language-autonym="Français" data-language-local-name="French" class="interlanguage-link-target"><span>Français</span></a></li><li class="interlanguage-link interwiki-ga mw-list-item"><a href="https://ga.wikipedia.org/wiki/Corpas_t%C3%A9acs" title="Corpas téacs – Irish" lang="ga" hreflang="ga" data-title="Corpas téacs" data-language-autonym="Gaeilge" data-language-local-name="Irish" class="interlanguage-link-target"><span>Gaeilge</span></a></li><li class="interlanguage-link interwiki-gl mw-list-item"><a href="https://gl.wikipedia.org/wiki/Corpus_ling%C3%BC%C3%ADstico" title="Corpus lingüístico – Galician" lang="gl" hreflang="gl" data-title="Corpus lingüístico" data-language-autonym="Galego" data-language-local-name="Galician" class="interlanguage-link-target"><span>Galego</span></a></li><li class="interlanguage-link interwiki-ko mw-list-item"><a href="https://ko.wikipedia.org/wiki/%EB%A7%90%EB%AD%89%EC%B9%98" title="말뭉치 – Korean" lang="ko" hreflang="ko" data-title="말뭉치" data-language-autonym="한국어" data-language-local-name="Korean" class="interlanguage-link-target"><span>한국어</span></a></li><li class="interlanguage-link interwiki-hy mw-list-item"><a href="https://hy.wikipedia.org/wiki/%D5%8F%D5%A5%D6%84%D5%BD%D5%BF%D5%AB_%D5%AF%D5%B8%D6%80%D5%BA%D5%B8%D6%82%D5%BD" title="Տեքստի կորպուս – Armenian" lang="hy" hreflang="hy" data-title="Տեքստի կորպուս" data-language-autonym="Հայերեն" data-language-local-name="Armenian" class="interlanguage-link-target"><span>Հայերեն</span></a></li><li class="interlanguage-link interwiki-hi mw-list-item"><a href="https://hi.wikipedia.org/wiki/%E0%A4%AA%E0%A4%BE%E0%A4%A0%E0%A4%B8%E0%A4%82%E0%A4%97%E0%A5%8D%E0%A4%B0%E0%A4%B9" title="पाठसंग्रह – Hindi" lang="hi" hreflang="hi" data-title="पाठसंग्रह" data-language-autonym="हिन्दी" data-language-local-name="Hindi" class="interlanguage-link-target"><span>हिन्दी</span></a></li><li class="interlanguage-link interwiki-io mw-list-item"><a href="https://io.wikipedia.org/wiki/Texto_korpo" title="Texto korpo – Ido" lang="io" hreflang="io" data-title="Texto korpo" data-language-autonym="Ido" data-language-local-name="Ido" class="interlanguage-link-target"><span>Ido</span></a></li><li class="interlanguage-link interwiki-id mw-list-item"><a href="https://id.wikipedia.org/wiki/Korpus_teks" title="Korpus teks – Indonesian" lang="id" hreflang="id" data-title="Korpus teks" data-language-autonym="Bahasa Indonesia" data-language-local-name="Indonesian" class="interlanguage-link-target"><span>Bahasa Indonesia</span></a></li><li class="interlanguage-link interwiki-it mw-list-item"><a href="https://it.wikipedia.org/wiki/Corpus" title="Corpus – Italian" lang="it" hreflang="it" data-title="Corpus" data-language-autonym="Italiano" data-language-local-name="Italian" class="interlanguage-link-target"><span>Italiano</span></a></li><li class="interlanguage-link interwiki-lv mw-list-item"><a href="https://lv.wikipedia.org/wiki/Valodas_korpuss" title="Valodas korpuss – Latvian" lang="lv" hreflang="lv" data-title="Valodas korpuss" data-language-autonym="Latviešu" data-language-local-name="Latvian" class="interlanguage-link-target"><span>Latviešu</span></a></li><li class="interlanguage-link interwiki-lt mw-list-item"><a href="https://lt.wikipedia.org/wiki/Tekstynas" title="Tekstynas – Lithuanian" lang="lt" hreflang="lt" data-title="Tekstynas" data-language-autonym="Lietuvių" data-language-local-name="Lithuanian" class="interlanguage-link-target"><span>Lietuvių</span></a></li><li class="interlanguage-link interwiki-hu mw-list-item"><a href="https://hu.wikipedia.org/wiki/Korpusz" title="Korpusz – Hungarian" lang="hu" hreflang="hu" data-title="Korpusz" data-language-autonym="Magyar" data-language-local-name="Hungarian" class="interlanguage-link-target"><span>Magyar</span></a></li><li class="interlanguage-link interwiki-mk mw-list-item"><a href="https://mk.wikipedia.org/wiki/%D0%9A%D0%BE%D1%80%D0%BF%D1%83%D1%81_%D1%82%D0%B5%D0%BA%D1%81%D1%82%D0%BE%D0%B2%D0%B8" title="Корпус текстови – Macedonian" lang="mk" hreflang="mk" data-title="Корпус текстови" data-language-autonym="Македонски" data-language-local-name="Macedonian" class="interlanguage-link-target"><span>Македонски</span></a></li><li class="interlanguage-link interwiki-ml mw-list-item"><a href="https://ml.wikipedia.org/wiki/%E0%B4%AD%E0%B4%BE%E0%B4%B7%E0%B4%BE_%E0%B4%95%E0%B5%8B%E0%B5%BC%E0%B4%AA%E0%B5%8D%E0%B4%AA%E0%B4%B8%E0%B5%8D" title="ഭാഷാ കോർപ്പസ് – Malayalam" lang="ml" hreflang="ml" data-title="ഭാഷാ കോർപ്പസ്" data-language-autonym="മലയാളം" data-language-local-name="Malayalam" class="interlanguage-link-target"><span>മലയാളം</span></a></li><li class="interlanguage-link interwiki-ms mw-list-item"><a href="https://ms.wikipedia.org/wiki/Korpus" title="Korpus – Malay" lang="ms" hreflang="ms" data-title="Korpus" data-language-autonym="Bahasa Melayu" data-language-local-name="Malay" class="interlanguage-link-target"><span>Bahasa Melayu</span></a></li><li class="interlanguage-link interwiki-nl mw-list-item"><a href="https://nl.wikipedia.org/wiki/Corpus_(taalkunde)" title="Corpus (taalkunde) – Dutch" lang="nl" hreflang="nl" data-title="Corpus (taalkunde)" data-language-autonym="Nederlands" data-language-local-name="Dutch" class="interlanguage-link-target"><span>Nederlands</span></a></li><li class="interlanguage-link interwiki-ja mw-list-item"><a href="https://ja.wikipedia.org/wiki/%E3%82%B3%E3%83%BC%E3%83%91%E3%82%B9" title="コーパス – Japanese" lang="ja" hreflang="ja" data-title="コーパス" data-language-autonym="日本語" data-language-local-name="Japanese" class="interlanguage-link-target"><span>日本語</span></a></li><li class="interlanguage-link interwiki-no mw-list-item"><a href="https://no.wikipedia.org/wiki/Tekstkorpus" title="Tekstkorpus – Norwegian Bokmål" lang="nb" hreflang="nb" data-title="Tekstkorpus" data-language-autonym="Norsk bokmål" data-language-local-name="Norwegian Bokmål" class="interlanguage-link-target"><span>Norsk bokmål</span></a></li><li class="interlanguage-link interwiki-nn mw-list-item"><a href="https://nn.wikipedia.org/wiki/Tekstkorpus" title="Tekstkorpus – Norwegian Nynorsk" lang="nn" hreflang="nn" data-title="Tekstkorpus" data-language-autonym="Norsk nynorsk" data-language-local-name="Norwegian Nynorsk" class="interlanguage-link-target"><span>Norsk nynorsk</span></a></li><li class="interlanguage-link interwiki-uz mw-list-item"><a href="https://uz.wikipedia.org/wiki/Matn_korpusi" title="Matn korpusi – Uzbek" lang="uz" hreflang="uz" data-title="Matn korpusi" data-language-autonym="Oʻzbekcha / ўзбекча" data-language-local-name="Uzbek" class="interlanguage-link-target"><span>Oʻzbekcha / ўзбекча</span></a></li><li class="interlanguage-link interwiki-pl mw-list-item"><a href="https://pl.wikipedia.org/wiki/Korpus_(j%C4%99zykoznawstwo)" title="Korpus (językoznawstwo) – Polish" lang="pl" hreflang="pl" data-title="Korpus (językoznawstwo)" data-language-autonym="Polski" data-language-local-name="Polish" class="interlanguage-link-target"><span>Polski</span></a></li><li class="interlanguage-link interwiki-pt mw-list-item"><a href="https://pt.wikipedia.org/wiki/Corpus_lingu%C3%ADstico" title="Corpus linguístico – Portuguese" lang="pt" hreflang="pt" data-title="Corpus linguístico" data-language-autonym="Português" data-language-local-name="Portuguese" class="interlanguage-link-target"><span>Português</span></a></li><li class="interlanguage-link interwiki-ru mw-list-item"><a href="https://ru.wikipedia.org/wiki/%D0%9A%D0%BE%D1%80%D0%BF%D1%83%D1%81_%D1%82%D0%B5%D0%BA%D1%81%D1%82%D0%BE%D0%B2" title="Корпус текстов – Russian" lang="ru" hreflang="ru" data-title="Корпус текстов" data-language-autonym="Русский" data-language-local-name="Russian" class="interlanguage-link-target"><span>Русский</span></a></li><li class="interlanguage-link interwiki-sk mw-list-item"><a href="https://sk.wikipedia.org/wiki/Korpus_(jazykoveda)" title="Korpus (jazykoveda) – Slovak" lang="sk" hreflang="sk" data-title="Korpus (jazykoveda)" data-language-autonym="Slovenčina" data-language-local-name="Slovak" class="interlanguage-link-target"><span>Slovenčina</span></a></li><li class="interlanguage-link interwiki-sl mw-list-item"><a href="https://sl.wikipedia.org/wiki/Besedilni_korpus" title="Besedilni korpus – Slovenian" lang="sl" hreflang="sl" data-title="Besedilni korpus" data-language-autonym="Slovenščina" data-language-local-name="Slovenian" class="interlanguage-link-target"><span>Slovenščina</span></a></li><li class="interlanguage-link interwiki-ckb mw-list-item"><a href="https://ckb.wikipedia.org/wiki/%DA%A9%DB%86%D8%B1%D9%BE%DB%95%D8%B3" title="کۆرپەس – Central Kurdish" lang="ckb" hreflang="ckb" data-title="کۆرپەس" data-language-autonym="کوردی" data-language-local-name="Central Kurdish" class="interlanguage-link-target"><span>کوردی</span></a></li><li class="interlanguage-link interwiki-fi mw-list-item"><a href="https://fi.wikipedia.org/wiki/Korpus" title="Korpus – Finnish" lang="fi" hreflang="fi" data-title="Korpus" data-language-autonym="Suomi" data-language-local-name="Finnish" class="interlanguage-link-target"><span>Suomi</span></a></li><li class="interlanguage-link interwiki-th mw-list-item"><a href="https://th.wikipedia.org/wiki/%E0%B8%84%E0%B8%A5%E0%B8%B1%E0%B8%87%E0%B8%82%E0%B9%89%E0%B8%AD%E0%B8%84%E0%B8%A7%E0%B8%B2%E0%B8%A1" title="คลังข้อความ – Thai" lang="th" hreflang="th" data-title="คลังข้อความ" data-language-autonym="ไทย" data-language-local-name="Thai" class="interlanguage-link-target"><span>ไทย</span></a></li><li class="interlanguage-link interwiki-uk mw-list-item"><a href="https://uk.wikipedia.org/wiki/%D0%9A%D0%BE%D1%80%D0%BF%D1%83%D1%81_%D1%82%D0%B5%D0%BA%D1%81%D1%82%D1%96%D0%B2" title="Корпус текстів – Ukrainian" lang="uk" hreflang="uk" data-title="Корпус текстів" data-language-autonym="Українська" data-language-local-name="Ukrainian" class="interlanguage-link-target"><span>Українська</span></a></li><li class="interlanguage-link interwiki-vi mw-list-item"><a href="https://vi.wikipedia.org/wiki/Ng%E1%BB%AF_li%E1%BB%87u_v%C4%83n_b%E1%BA%A3n" title="Ngữ liệu văn bản – Vietnamese" lang="vi" hreflang="vi" data-title="Ngữ liệu văn bản" data-language-autonym="Tiếng Việt" data-language-local-name="Vietnamese" class="interlanguage-link-target"><span>Tiếng Việt</span></a></li><li class="interlanguage-link interwiki-zh-yue mw-list-item"><a href="https://zh-yue.wikipedia.org/wiki/%E8%AA%9E%E6%96%99%E5%BA%AB" title="語料庫 – Cantonese" lang="yue" hreflang="yue" data-title="語料庫" data-language-autonym="粵語" data-language-local-name="Cantonese" class="interlanguage-link-target"><span>粵語</span></a></li><li class="interlanguage-link interwiki-zh mw-list-item"><a href="https://zh.wikipedia.org/wiki/%E8%AF%AD%E6%96%99%E5%BA%93" title="语料库 – Chinese" lang="zh" hreflang="zh" data-title="语料库" data-language-autonym="中文" data-language-local-name="Chinese" class="interlanguage-link-target"><span>中文</span></a></li><li class="interlanguage-link interwiki-rsk mw-list-item"><a href="https://rsk.wikipedia.org/wiki/%D0%A2%D0%B5%D0%BA%D1%81%D1%82%D1%83%D0%B0%D0%BB%D0%BD%D0%B8_%D0%BA%D0%BE%D1%80%D0%BF%D1%83%D1%81" title="Текстуални корпус – Pannonian Rusyn" lang="rsk" hreflang="rsk" data-title="Текстуални корпус" data-language-autonym="Руски" data-language-local-name="Pannonian Rusyn" class="interlanguage-link-target"><span>Руски</span></a></li> </ul> <div class="after-portlet after-portlet-lang"><span class="wb-langlinks-edit wb-langlinks-link"><a href="https://www.wikidata.org/wiki/Special:EntityPage/Q461183#sitelinks-wikipedia" title="Edit interlanguage links" class="wbc-editpage">Edit links</a></span></div> </div> </div> </div> </header> <div class="vector-page-toolbar"> <div class="vector-page-toolbar-container"> <div id="left-navigation"> <nav aria-label="Namespaces"> <div id="p-associated-pages" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-associated-pages" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-nstab-main" class="selected vector-tab-noicon mw-list-item"><a href="/wiki/Text_corpus" title="View the content page [c]" accesskey="c"><span>Article</span></a></li><li id="ca-talk" class="vector-tab-noicon mw-list-item"><a href="/wiki/Talk:Text_corpus" rel="discussion" title="Discuss improvements to the content page [t]" accesskey="t"><span>Talk</span></a></li> </ul> </div> </div> <div id="vector-variants-dropdown" class="vector-dropdown emptyPortlet" > <input type="checkbox" id="vector-variants-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-variants-dropdown" class="vector-dropdown-checkbox " aria-label="Change language variant" > <label id="vector-variants-dropdown-label" for="vector-variants-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" aria-hidden="true" ><span class="vector-dropdown-label-text">English</span> </label> <div class="vector-dropdown-content"> <div id="p-variants" class="vector-menu mw-portlet mw-portlet-variants emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> </div> </div> </nav> </div> <div id="right-navigation" class="vector-collapsible"> <nav aria-label="Views"> <div id="p-views" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-views" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-view" class="selected vector-tab-noicon mw-list-item"><a href="/wiki/Text_corpus"><span>Read</span></a></li><li id="ca-edit" class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Text_corpus&action=edit" title="Edit this page [e]" accesskey="e"><span>Edit</span></a></li><li id="ca-history" class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Text_corpus&action=history" title="Past revisions of this page [h]" accesskey="h"><span>View history</span></a></li> </ul> </div> </div> </nav> <nav class="vector-page-tools-landmark" aria-label="Page tools"> <div id="vector-page-tools-dropdown" class="vector-dropdown vector-page-tools-dropdown" > <input type="checkbox" id="vector-page-tools-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-tools-dropdown" class="vector-dropdown-checkbox " aria-label="Tools" > <label id="vector-page-tools-dropdown-label" for="vector-page-tools-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" aria-hidden="true" ><span class="vector-dropdown-label-text">Tools</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-tools-unpinned-container" class="vector-unpinned-container"> <div id="vector-page-tools" class="vector-page-tools vector-pinnable-element"> <div class="vector-pinnable-header vector-page-tools-pinnable-header vector-pinnable-header-unpinned" data-feature-name="page-tools-pinned" data-pinnable-element-id="vector-page-tools" data-pinned-container-id="vector-page-tools-pinned-container" data-unpinned-container-id="vector-page-tools-unpinned-container" > <div class="vector-pinnable-header-label">Tools</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-page-tools.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-page-tools.unpin">hide</button> </div> <div id="p-cactions" class="vector-menu mw-portlet mw-portlet-cactions emptyPortlet vector-has-collapsible-items" title="More options" > <div class="vector-menu-heading"> Actions </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-more-view" class="selected vector-more-collapsible-item mw-list-item"><a href="/wiki/Text_corpus"><span>Read</span></a></li><li id="ca-more-edit" class="vector-more-collapsible-item mw-list-item"><a href="/w/index.php?title=Text_corpus&action=edit" title="Edit this page [e]" accesskey="e"><span>Edit</span></a></li><li id="ca-more-history" class="vector-more-collapsible-item mw-list-item"><a href="/w/index.php?title=Text_corpus&action=history"><span>View history</span></a></li> </ul> </div> </div> <div id="p-tb" class="vector-menu mw-portlet mw-portlet-tb" > <div class="vector-menu-heading"> General </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="t-whatlinkshere" class="mw-list-item"><a href="/wiki/Special:WhatLinksHere/Text_corpus" title="List of all English Wikipedia pages containing links to this page [j]" accesskey="j"><span>What links here</span></a></li><li id="t-recentchangeslinked" class="mw-list-item"><a href="/wiki/Special:RecentChangesLinked/Text_corpus" rel="nofollow" title="Recent changes in pages linked from this page [k]" accesskey="k"><span>Related changes</span></a></li><li id="t-upload" class="mw-list-item"><a href="//en.wikipedia.org/wiki/Wikipedia:File_Upload_Wizard" title="Upload files [u]" accesskey="u"><span>Upload file</span></a></li><li id="t-permalink" class="mw-list-item"><a href="/w/index.php?title=Text_corpus&oldid=1257324423" title="Permanent link to this revision of this page"><span>Permanent link</span></a></li><li id="t-info" class="mw-list-item"><a href="/w/index.php?title=Text_corpus&action=info" title="More information about this page"><span>Page information</span></a></li><li id="t-cite" class="mw-list-item"><a href="/w/index.php?title=Special:CiteThisPage&page=Text_corpus&id=1257324423&wpFormIdentifier=titleform" title="Information on how to cite this page"><span>Cite this page</span></a></li><li id="t-urlshortener" class="mw-list-item"><a href="/w/index.php?title=Special:UrlShortener&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FText_corpus"><span>Get shortened URL</span></a></li><li id="t-urlshortener-qrcode" class="mw-list-item"><a href="/w/index.php?title=Special:QrCode&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FText_corpus"><span>Download QR code</span></a></li> </ul> </div> </div> <div id="p-coll-print_export" class="vector-menu mw-portlet mw-portlet-coll-print_export" > <div class="vector-menu-heading"> Print/export </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="coll-download-as-rl" class="mw-list-item"><a href="/w/index.php?title=Special:DownloadAsPdf&page=Text_corpus&action=show-download-screen" title="Download this page as a PDF file"><span>Download as PDF</span></a></li><li id="t-print" class="mw-list-item"><a href="/w/index.php?title=Text_corpus&printable=yes" title="Printable version of this page [p]" accesskey="p"><span>Printable version</span></a></li> </ul> </div> </div> <div id="p-wikibase-otherprojects" class="vector-menu mw-portlet mw-portlet-wikibase-otherprojects" > <div class="vector-menu-heading"> In other projects </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li class="wb-otherproject-link wb-otherproject-commons mw-list-item"><a href="https://commons.wikimedia.org/wiki/Category:Text_corpus" hreflang="en"><span>Wikimedia Commons</span></a></li><li id="t-wikibase" class="wb-otherproject-link wb-otherproject-wikibase-dataitem mw-list-item"><a href="https://www.wikidata.org/wiki/Special:EntityPage/Q461183" title="Structured data on this page hosted by Wikidata [g]" accesskey="g"><span>Wikidata item</span></a></li> </ul> </div> </div> </div> </div> </div> </div> </nav> </div> </div> </div> <div class="vector-column-end"> <div class="vector-sticky-pinned-container"> <nav class="vector-page-tools-landmark" aria-label="Page tools"> <div id="vector-page-tools-pinned-container" class="vector-pinned-container"> </div> </nav> <nav class="vector-appearance-landmark" aria-label="Appearance"> <div id="vector-appearance-pinned-container" class="vector-pinned-container"> <div id="vector-appearance" class="vector-appearance vector-pinnable-element"> <div class="vector-pinnable-header vector-appearance-pinnable-header vector-pinnable-header-pinned" data-feature-name="appearance-pinned" data-pinnable-element-id="vector-appearance" data-pinned-container-id="vector-appearance-pinned-container" data-unpinned-container-id="vector-appearance-unpinned-container" > <div class="vector-pinnable-header-label">Appearance</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-appearance.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-appearance.unpin">hide</button> </div> </div> </div> </nav> </div> </div> <div id="bodyContent" class="vector-body" aria-labelledby="firstHeading" data-mw-ve-target-container> <div class="vector-body-before-content"> <div class="mw-indicators"> </div> <div id="siteSub" class="noprint">From Wikipedia, the free encyclopedia</div> </div> <div id="contentSub"><div id="mw-content-subtitle"></div></div> <div id="mw-content-text" class="mw-body-content"><div class="mw-content-ltr mw-parser-output" lang="en" dir="ltr"><div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">Digital collections of natural language data</div> <p class="mw-empty-elt"> </p><p>In <a href="/wiki/Linguistics" title="Linguistics">linguistics</a> and <a href="/wiki/Natural_language_processing" title="Natural language processing">natural language processing</a>, a <b>corpus</b> (<abbr title="plural">pl.</abbr>: <b>corpora</b>) or <b>text corpus</b> is a dataset, consisting of natively digital and older, digitalized, <a href="/wiki/Language_resource" title="Language resource">language resources</a>, either annotated or unannotated. </p><p>Annotated, they have been used in <a href="/wiki/Corpus_linguistics" title="Corpus linguistics">corpus linguistics</a> for statistical <a href="/wiki/Statistical_hypothesis_testing" class="mw-redirect" title="Statistical hypothesis testing">hypothesis testing</a>, checking occurrences or validating linguistic rules within a specific language territory. </p> <meta property="mw:PageProp/toc" /> <div class="mw-heading mw-heading2"><h2 id="Overview">Overview</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Text_corpus&action=edit&section=1" title="Edit section: Overview"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>A corpus may contain texts in a single language (<i>monolingual corpus</i>) or text data in multiple languages (<i>multilingual corpus</i>). </p><p>In order to make the corpora more useful for doing linguistic research, they are often subjected to a process known as <a href="/wiki/Annotation" title="Annotation">annotation</a>. An example of annotating a corpus is <a href="/wiki/Part-of-speech_tagging" title="Part-of-speech tagging">part-of-speech tagging</a>, or <i>POS-tagging</i>, in which information about each word's part of speech (verb, noun, adjective, etc.) is added to the corpus in the form of <i>tags</i>. Another example is indicating the <a href="/wiki/Lemma_(morphology)" title="Lemma (morphology)">lemma</a> (base) form of each word. When the language of the corpus is not a working language of the researchers who use it, <a href="/wiki/Interlinear_gloss" title="Interlinear gloss">interlinear glossing</a> is used to make the annotation bilingual. </p><p>Some corpora have further <i>structured</i> levels of analysis applied. In particular, smaller corpora may be fully <a href="/wiki/Parsing" title="Parsing">parsed</a>. Such corpora are usually called <a href="/wiki/Treebank" title="Treebank">Treebanks</a> or <a href="/wiki/Treebank" title="Treebank">Parsed Corpora</a>. The difficulty of ensuring that the entire corpus is completely and consistently annotated means that these corpora are usually smaller, containing around one to three million words. Other levels of linguistic structured analysis are possible, including annotations for <a href="/wiki/Morphology_(linguistics)" title="Morphology (linguistics)">morphology</a>, <a href="/wiki/Semantics" title="Semantics">semantics</a> and <a href="/wiki/Pragmatics" title="Pragmatics">pragmatics</a>. </p> <div class="mw-heading mw-heading2"><h2 id="Applications">Applications</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Text_corpus&action=edit&section=2" title="Edit section: Applications"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Corpora are the main knowledge base in <a href="/wiki/Corpus_linguistics" title="Corpus linguistics">corpus linguistics</a>. Other notable areas of application include: </p> <ul><li><a href="/wiki/Language_technology" title="Language technology">Language technology</a>, <a href="/wiki/Natural_language_processing" title="Natural language processing">natural language processing</a>, <a href="/wiki/Computational_linguistics" title="Computational linguistics">computational linguistics</a> <ul><li>The analysis and processing of various types of corpora are also the subject of much work in <a href="/wiki/Computational_linguistics" title="Computational linguistics">computational linguistics</a>, <a href="/wiki/Speech_recognition" title="Speech recognition">speech recognition</a> and <a href="/wiki/Machine_translation" title="Machine translation">machine translation</a>, where they are often used to create <a href="/wiki/Hidden_Markov_model" title="Hidden Markov model">hidden Markov models</a> for part of speech tagging and other purposes. Corpora and <a href="/wiki/Frequency_list" class="mw-redirect" title="Frequency list">frequency lists</a> derived from them are useful for <a href="/wiki/Language_teaching" class="mw-redirect" title="Language teaching">language teaching</a>. Corpora can be considered as a type of <a href="/wiki/Foreign_language_writing_aid" class="mw-redirect" title="Foreign language writing aid">foreign language writing aid</a> as the contextualised grammatical knowledge acquired by non-native language users through exposure to authentic texts in corpora allows learners to grasp the manner of sentence formation in the target language, enabling effective writing.<sup id="cite_ref-Yoon_1-0" class="reference"><a href="#cite_note-Yoon-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup></li></ul></li></ul> <ul><li><a href="/wiki/Machine_translation" title="Machine translation">Machine translation</a> <ul><li>Multilingual corpora that have been specially formatted for side-by-side comparison are called <i>aligned parallel corpora</i>. There are two main types of <a href="/wiki/Parallel_corpora" class="mw-redirect" title="Parallel corpora">parallel corpora</a> which contain texts in two languages. In a <i>translation corpus</i>, the texts in one language are translations of texts in the other language. In a <i>comparable corpus</i>, the texts are of the same kind and cover the same content, but they are not translations of each other.<sup id="cite_ref-2" class="reference"><a href="#cite_note-2"><span class="cite-bracket">[</span>2<span class="cite-bracket">]</span></a></sup> To exploit a parallel text, some kind of text alignment identifying equivalent text segments (phrases or sentences) is a prerequisite for analysis. <a href="/wiki/Machine_translation" title="Machine translation">Machine translation</a> algorithms for translating between two languages are often trained using parallel fragments comprising a first-language corpus and a second-language corpus, which is an element-for-element translation of the first-language corpus.<sup id="cite_ref-3" class="reference"><a href="#cite_note-3"><span class="cite-bracket">[</span>3<span class="cite-bracket">]</span></a></sup></li></ul></li></ul> <ul><li><a href="/wiki/Philology" title="Philology">Philologies</a> <ul><li>Text corpora are also used in the study of <a href="/wiki/Historical_document" title="Historical document">historical documents</a>, for example in attempts to <a href="/wiki/Decipherment" title="Decipherment">decipher</a> ancient scripts, or in <a href="/wiki/Biblical_scholarship" class="mw-redirect" title="Biblical scholarship">Biblical scholarship</a>. Some archaeological corpora can be of such short duration that they provide a snapshot in time. One of the shortest corpora in time may be the 15–30 year <a href="/wiki/Amarna_letters" title="Amarna letters">Amarna letters</a> texts (<a href="/wiki/1350_BC" class="mw-redirect" title="1350 BC">1350 BC</a>). The <i>corpus</i> of an ancient city, (for example the "<a href="/wiki/K%C3%BCltepe" title="Kültepe">Kültepe</a> Texts" of Turkey), may go through a series of corpora, determined by their find site dates.</li></ul></li></ul> <div class="mw-heading mw-heading2"><h2 id="Some_notable_text_corpora">Some notable text corpora</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Text_corpus&action=edit&section=3" title="Edit section: Some notable text corpora"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <style data-mw-deduplicate="TemplateStyles:r1236090951">.mw-parser-output .hatnote{font-style:italic}.mw-parser-output div.hatnote{padding-left:1.6em;margin-bottom:0.5em}.mw-parser-output .hatnote i{font-style:normal}.mw-parser-output .hatnote+link+.hatnote{margin-top:-0.5em}@media print{body.ns-0 .mw-parser-output .hatnote{display:none!important}}</style><div role="note" class="hatnote navigation-not-searchable">Main article: <a href="/wiki/List_of_text_corpora" title="List of text corpora">List of text corpora</a></div> <div class="mw-heading mw-heading2"><h2 id="See_also">See also</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Text_corpus&action=edit&section=4" title="Edit section: See also"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <ul><li><a href="/wiki/Concordance_(publishing)" title="Concordance (publishing)">Concordance</a></li> <li><a href="/wiki/Corpus_linguistics" title="Corpus linguistics">Corpus linguistics</a></li> <li><a href="/wiki/Culturomics" title="Culturomics">Culturomics</a></li> <li><a href="/wiki/Distributional%E2%80%93relational_database" title="Distributional–relational database">Distributional–relational database</a></li> <li><a href="/wiki/Linguistic_Data_Consortium" title="Linguistic Data Consortium">Linguistic Data Consortium</a></li> <li><a href="/wiki/Natural_language_processing" title="Natural language processing">Natural language processing</a></li> <li><a href="/wiki/Natural_Language_Toolkit" title="Natural Language Toolkit">Natural Language Toolkit</a></li> <li><a href="/wiki/Parallel_text" title="Parallel text">Parallel text</a></li> <li><a href="/wiki/Speech_corpus" title="Speech corpus">Speech corpus</a></li> <li><a href="/wiki/Translation_memory" title="Translation memory">Translation memory</a></li> <li><a href="/wiki/Treebank" title="Treebank">Treebank</a></li> <li><a href="/wiki/Zipf%27s_law" title="Zipf's law">Zipf's law</a></li></ul> <div class="mw-heading mw-heading2"><h2 id="References">References</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Text_corpus&action=edit&section=5" title="Edit section: References"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <style data-mw-deduplicate="TemplateStyles:r1239543626">.mw-parser-output .reflist{margin-bottom:0.5em;list-style-type:decimal}@media screen{.mw-parser-output .reflist{font-size:90%}}.mw-parser-output .reflist .references{font-size:100%;margin-bottom:0;list-style-type:inherit}.mw-parser-output .reflist-columns-2{column-width:30em}.mw-parser-output .reflist-columns-3{column-width:25em}.mw-parser-output .reflist-columns{margin-top:0.3em}.mw-parser-output .reflist-columns ol{margin-top:0}.mw-parser-output .reflist-columns li{page-break-inside:avoid;break-inside:avoid-column}.mw-parser-output .reflist-upper-alpha{list-style-type:upper-alpha}.mw-parser-output .reflist-upper-roman{list-style-type:upper-roman}.mw-parser-output .reflist-lower-alpha{list-style-type:lower-alpha}.mw-parser-output .reflist-lower-greek{list-style-type:lower-greek}.mw-parser-output .reflist-lower-roman{list-style-type:lower-roman}</style><div class="reflist"> <div class="mw-references-wrap"><ol class="references"> <li id="cite_note-Yoon-1"><span class="mw-cite-backlink"><b><a href="#cite_ref-Yoon_1-0">^</a></b></span> <span class="reference-text">Yoon, H., & Hirvela, A. (2004). <a rel="nofollow" class="external text" href="https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.1073.2322&rep=rep1&type=pdf">ESL Student Attitudes toward Corpus Use in L2 Writing</a>. <i>Journal of Second Language Writing, 13</i>(4), 257–283. Retrieved 21 March 2012.</span> </li> <li id="cite_note-2"><span class="mw-cite-backlink"><b><a href="#cite_ref-2">^</a></b></span> <span class="reference-text"><style data-mw-deduplicate="TemplateStyles:r1238218222">.mw-parser-output cite.citation{font-style:inherit;word-wrap:break-word}.mw-parser-output .citation q{quotes:"\"""\"""'""'"}.mw-parser-output .citation:target{background-color:rgba(0,127,255,0.133)}.mw-parser-output .id-lock-free.id-lock-free a{background:url("//upload.wikimedia.org/wikipedia/commons/6/65/Lock-green.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-limited.id-lock-limited a,.mw-parser-output .id-lock-registration.id-lock-registration a{background:url("//upload.wikimedia.org/wikipedia/commons/d/d6/Lock-gray-alt-2.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-subscription.id-lock-subscription a{background:url("//upload.wikimedia.org/wikipedia/commons/a/aa/Lock-red-alt-2.svg")right 0.1em center/9px no-repeat}.mw-parser-output .cs1-ws-icon a{background:url("//upload.wikimedia.org/wikipedia/commons/4/4c/Wikisource-logo.svg")right 0.1em center/12px no-repeat}body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-free a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-limited a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-registration a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-subscription a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .cs1-ws-icon a{background-size:contain;padding:0 1em 0 0}.mw-parser-output .cs1-code{color:inherit;background:inherit;border:none;padding:inherit}.mw-parser-output .cs1-hidden-error{display:none;color:var(--color-error,#d33)}.mw-parser-output .cs1-visible-error{color:var(--color-error,#d33)}.mw-parser-output .cs1-maint{display:none;color:#085;margin-left:0.3em}.mw-parser-output .cs1-kern-left{padding-left:0.2em}.mw-parser-output .cs1-kern-right{padding-right:0.2em}.mw-parser-output .citation .mw-selflink{font-weight:inherit}@media screen{.mw-parser-output .cs1-format{font-size:95%}html.skin-theme-clientpref-night .mw-parser-output .cs1-maint{color:#18911f}}@media screen and (prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .cs1-maint{color:#18911f}}</style><cite id="CITEREFWołkMarasek2014" class="citation book cs1">Wołk, K.; Marasek, K. (7 April 2014). "Real-Time Statistical Speech Translation". <i>New Perspectives in Information Systems and Technologies, Volume 1</i>. Advances in Intelligent Systems and Computing. Vol. 275. Springer. pp. <span class="nowrap">107–</span>114. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1509.09090">1509.09090</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1007%2F978-3-319-05951-8_11">10.1007/978-3-319-05951-8_11</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-3-319-05950-1" title="Special:BookSources/978-3-319-05950-1"><bdi>978-3-319-05950-1</bdi></a>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/2194-5357">2194-5357</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:15361632">15361632</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Real-Time+Statistical+Speech+Translation&rft.btitle=New+Perspectives+in+Information+Systems+and+Technologies%2C+Volume+1&rft.series=Advances+in+Intelligent+Systems+and+Computing&rft.pages=%3Cspan+class%3D%22nowrap%22%3E107-%3C%2Fspan%3E114&rft.pub=Springer&rft.date=2014-04-07&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A15361632%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1007%2F978-3-319-05951-8_11&rft_id=info%3Aarxiv%2F1509.09090&rft.issn=2194-5357&rft.isbn=978-3-319-05950-1&rft.aulast=Wo%C5%82k&rft.aufirst=K.&rft.au=Marasek%2C+K.&rfr_id=info%3Asid%2Fen.wikipedia.org%3AText+corpus" class="Z3988"></span></span> </li> <li id="cite_note-3"><span class="mw-cite-backlink"><b><a href="#cite_ref-3">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFWolkMarasek2015" class="citation conference cs1">Wolk, Krzysztof; Marasek, Krzysztof (2015). "Tuned and GPU-accelerated parallel data mining from comparable corpora". In Král, Pavel; Matoušek, Václav (eds.). <i>Text, Speech, and Dialogue – 18th International Conference, TSD 2015, Plzeň, Czech Republic, September 14–17, 2015, Proceedings</i>. Lecture Notes in Computer Science. Vol. 9302. Springer. pp. <span class="nowrap">32–</span>40. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1509.08639">1509.08639</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1007%2F978-3-319-24033-6_4">10.1007/978-3-319-24033-6_4</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-3-319-24032-9" title="Special:BookSources/978-3-319-24032-9"><bdi>978-3-319-24032-9</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.atitle=Tuned+and+GPU-accelerated+parallel+data+mining+from+comparable+corpora&rft.btitle=Text%2C+Speech%2C+and+Dialogue+%E2%80%93+18th+International+Conference%2C+TSD+2015%2C+Plze%C5%88%2C+Czech+Republic%2C+September+14%E2%80%9317%2C+2015%2C+Proceedings&rft.series=Lecture+Notes+in+Computer+Science&rft.pages=%3Cspan+class%3D%22nowrap%22%3E32-%3C%2Fspan%3E40&rft.pub=Springer&rft.date=2015&rft_id=info%3Aarxiv%2F1509.08639&rft_id=info%3Adoi%2F10.1007%2F978-3-319-24033-6_4&rft.isbn=978-3-319-24032-9&rft.aulast=Wolk&rft.aufirst=Krzysztof&rft.au=Marasek%2C+Krzysztof&rfr_id=info%3Asid%2Fen.wikipedia.org%3AText+corpus" class="Z3988"></span></span> </li> </ol></div></div> <div class="mw-heading mw-heading2"><h2 id="External_links">External links</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Text_corpus&action=edit&section=6" title="Edit section: External links"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <ul><li><a rel="nofollow" class="external text" href="http://www.clres.com/corp.html">ACL SIGLEX Resource Links: Text Corpora</a> <a rel="nofollow" class="external text" href="https://web.archive.org/web/20130813141813/http://www.clres.com/corp.html">Archived</a> 2013-08-13 at the <a href="/wiki/Wayback_Machine" title="Wayback Machine">Wayback Machine</a></li> <li><a rel="nofollow" class="external text" href="https://archive.today/20121222193153/http://www.ahds.ac.uk/linguistic-corpora">Developing Linguistic Corpora: a Guide to Good Practice</a></li> <li><a rel="nofollow" class="external text" href="http://corpus.byu.edu/">Free samples (not free), web-based corpora (45-425 million words each): American (COCA, COHA, TIME), British (BNC), Spanish, Portuguese</a></li> <li><a rel="nofollow" class="external text" href="http://ucnk.korpus.cz/intercorp/?lang=en">Intercorp</a> Building synchronous parallel corpora of the languages taught at the Faculty of Arts of Charles University.</li> <li><a rel="nofollow" class="external text" href="https://the.sketchengine.co.uk/open/">Sketch Engine: Open corpora with free access</a></li> <li><a rel="nofollow" class="external text" href="http://www.tscorpus.com/">TS Corpus – A Turkish Corpus freely available for academic research.</a></li> <li><a rel="nofollow" class="external text" href="http://www.tnc.org.tr/">Turkish National Corpus – A general-purpose corpus for contemporary Turkish</a></li> <li><a rel="nofollow" class="external text" href="https://digital.lib.hkbu.edu.hk/corpus/index.php">Corpus of Political Speeches</a>, Free access to political speeches by American and Chinese politicians, developed by Hong Kong Baptist University Library</li> <li><a rel="nofollow" class="external text" href="https://ruscorpora.ru/en/">Russian National Corpus</a></li></ul> <p><br /> </p> <div class="navbox-styles"><style data-mw-deduplicate="TemplateStyles:r1129693374">.mw-parser-output .hlist dl,.mw-parser-output .hlist ol,.mw-parser-output .hlist ul{margin:0;padding:0}.mw-parser-output .hlist dd,.mw-parser-output .hlist dt,.mw-parser-output .hlist li{margin:0;display:inline}.mw-parser-output .hlist.inline,.mw-parser-output .hlist.inline dl,.mw-parser-output .hlist.inline ol,.mw-parser-output .hlist.inline ul,.mw-parser-output .hlist dl dl,.mw-parser-output .hlist dl ol,.mw-parser-output .hlist dl ul,.mw-parser-output .hlist ol dl,.mw-parser-output .hlist ol ol,.mw-parser-output .hlist ol ul,.mw-parser-output .hlist ul dl,.mw-parser-output .hlist ul ol,.mw-parser-output .hlist ul ul{display:inline}.mw-parser-output .hlist .mw-empty-li{display:none}.mw-parser-output .hlist dt::after{content:": "}.mw-parser-output .hlist dd::after,.mw-parser-output .hlist li::after{content:" · ";font-weight:bold}.mw-parser-output .hlist dd:last-child::after,.mw-parser-output .hlist dt:last-child::after,.mw-parser-output .hlist li:last-child::after{content:none}.mw-parser-output .hlist dd dd:first-child::before,.mw-parser-output .hlist dd dt:first-child::before,.mw-parser-output .hlist dd li:first-child::before,.mw-parser-output .hlist dt dd:first-child::before,.mw-parser-output .hlist dt dt:first-child::before,.mw-parser-output .hlist dt li:first-child::before,.mw-parser-output .hlist li dd:first-child::before,.mw-parser-output .hlist li dt:first-child::before,.mw-parser-output .hlist li li:first-child::before{content:" (";font-weight:normal}.mw-parser-output .hlist dd dd:last-child::after,.mw-parser-output .hlist dd dt:last-child::after,.mw-parser-output .hlist dd li:last-child::after,.mw-parser-output .hlist dt dd:last-child::after,.mw-parser-output .hlist dt dt:last-child::after,.mw-parser-output .hlist dt li:last-child::after,.mw-parser-output .hlist li dd:last-child::after,.mw-parser-output .hlist li dt:last-child::after,.mw-parser-output .hlist li li:last-child::after{content:")";font-weight:normal}.mw-parser-output .hlist ol{counter-reset:listitem}.mw-parser-output .hlist ol>li{counter-increment:listitem}.mw-parser-output .hlist ol>li::before{content:" "counter(listitem)"\a0 "}.mw-parser-output .hlist dd ol>li:first-child::before,.mw-parser-output .hlist dt ol>li:first-child::before,.mw-parser-output .hlist li ol>li:first-child::before{content:" ("counter(listitem)"\a0 "}</style><style data-mw-deduplicate="TemplateStyles:r1236075235">.mw-parser-output .navbox{box-sizing:border-box;border:1px solid #a2a9b1;width:100%;clear:both;font-size:88%;text-align:center;padding:1px;margin:1em auto 0}.mw-parser-output .navbox .navbox{margin-top:0}.mw-parser-output .navbox+.navbox,.mw-parser-output .navbox+.navbox-styles+.navbox{margin-top:-1px}.mw-parser-output .navbox-inner,.mw-parser-output .navbox-subgroup{width:100%}.mw-parser-output .navbox-group,.mw-parser-output .navbox-title,.mw-parser-output .navbox-abovebelow{padding:0.25em 1em;line-height:1.5em;text-align:center}.mw-parser-output .navbox-group{white-space:nowrap;text-align:right}.mw-parser-output .navbox,.mw-parser-output .navbox-subgroup{background-color:#fdfdfd}.mw-parser-output .navbox-list{line-height:1.5em;border-color:#fdfdfd}.mw-parser-output .navbox-list-with-group{text-align:left;border-left-width:2px;border-left-style:solid}.mw-parser-output tr+tr>.navbox-abovebelow,.mw-parser-output tr+tr>.navbox-group,.mw-parser-output tr+tr>.navbox-image,.mw-parser-output tr+tr>.navbox-list{border-top:2px solid #fdfdfd}.mw-parser-output .navbox-title{background-color:#ccf}.mw-parser-output .navbox-abovebelow,.mw-parser-output .navbox-group,.mw-parser-output .navbox-subgroup .navbox-title{background-color:#ddf}.mw-parser-output .navbox-subgroup .navbox-group,.mw-parser-output .navbox-subgroup .navbox-abovebelow{background-color:#e6e6ff}.mw-parser-output .navbox-even{background-color:#f7f7f7}.mw-parser-output .navbox-odd{background-color:transparent}.mw-parser-output .navbox .hlist td dl,.mw-parser-output .navbox .hlist td ol,.mw-parser-output .navbox .hlist td ul,.mw-parser-output .navbox td.hlist dl,.mw-parser-output .navbox td.hlist ol,.mw-parser-output .navbox td.hlist ul{padding:0.125em 0}.mw-parser-output .navbox .navbar{display:block;font-size:100%}.mw-parser-output .navbox-title .navbar{float:left;text-align:left;margin-right:0.5em}body.skin--responsive .mw-parser-output .navbox-image img{max-width:none!important}@media print{body.ns-0 .mw-parser-output .navbox{display:none!important}}</style></div><div role="navigation" class="navbox" aria-labelledby="Natural_language_processing454" style="padding:3px"><table class="nowraplinks hlist mw-collapsible autocollapse navbox-inner" style="border-spacing:0;background:transparent;color:inherit"><tbody><tr><th scope="col" class="navbox-title" colspan="2"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374" /><style data-mw-deduplicate="TemplateStyles:r1239400231">.mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{float:left;text-align:left}.mw-parser-output .navbar-boxtext{word-spacing:0}.mw-parser-output .navbar ul{display:inline-block;white-space:nowrap;line-height:inherit}.mw-parser-output .navbar-brackets::before{margin-right:-0.125em;content:"[ "}.mw-parser-output .navbar-brackets::after{margin-left:-0.125em;content:" ]"}.mw-parser-output .navbar li{word-spacing:-0.125em}.mw-parser-output .navbar a>span,.mw-parser-output .navbar a>abbr{text-decoration:inherit}.mw-parser-output .navbar-mini abbr{font-variant:small-caps;border-bottom:none;text-decoration:none;cursor:inherit}.mw-parser-output .navbar-ct-full{font-size:114%;margin:0 7em}.mw-parser-output .navbar-ct-mini{font-size:114%;margin:0 4em}html.skin-theme-clientpref-night .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}@media(prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}}@media print{.mw-parser-output .navbar{display:none!important}}</style><div class="navbar plainlinks hlist navbar-mini"><ul><li class="nv-view"><a href="/wiki/Template:Natural_language_processing" title="Template:Natural language processing"><abbr title="View this template">v</abbr></a></li><li class="nv-talk"><a href="/wiki/Template_talk:Natural_language_processing" title="Template talk:Natural language processing"><abbr title="Discuss this template">t</abbr></a></li><li class="nv-edit"><a href="/wiki/Special:EditPage/Template:Natural_language_processing" title="Special:EditPage/Template:Natural language processing"><abbr title="Edit this template">e</abbr></a></li></ul></div><div id="Natural_language_processing454" style="font-size:114%;margin:0 4em"><a href="/wiki/Natural_language_processing" title="Natural language processing">Natural language processing</a></div></th></tr><tr><th scope="row" class="navbox-group" style="width:1%">General terms</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/AI-complete" title="AI-complete">AI-complete</a></li> <li><a href="/wiki/Bag-of-words_model" title="Bag-of-words model">Bag-of-words</a></li> <li><a href="/wiki/N-gram" title="N-gram">n-gram</a> <ul><li><a href="/wiki/Bigram" title="Bigram">Bigram</a></li> <li><a href="/wiki/Trigram" title="Trigram">Trigram</a></li></ul></li> <li><a href="/wiki/Computational_linguistics" title="Computational linguistics">Computational linguistics</a></li> <li><a href="/wiki/Natural_language_understanding" title="Natural language understanding">Natural language understanding</a></li> <li><a href="/wiki/Stop_word" title="Stop word">Stop words</a></li> <li><a href="/wiki/Text_processing" title="Text processing">Text processing</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Text_mining" title="Text mining">Text analysis</a></th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Argument_mining" title="Argument mining">Argument mining</a></li> <li><a href="/wiki/Collocation_extraction" title="Collocation extraction">Collocation extraction</a></li> <li><a href="/wiki/Concept_mining" title="Concept mining">Concept mining</a></li> <li><a href="/wiki/Coreference#Coreference_resolution" title="Coreference">Coreference resolution</a></li> <li><a href="/wiki/Deep_linguistic_processing" title="Deep linguistic processing">Deep linguistic processing</a></li> <li><a href="/wiki/Distant_reading" title="Distant reading">Distant reading</a></li> <li><a href="/wiki/Information_extraction" title="Information extraction">Information extraction</a></li> <li><a href="/wiki/Named-entity_recognition" title="Named-entity recognition">Named-entity recognition</a></li> <li><a href="/wiki/Ontology_learning" title="Ontology learning">Ontology learning</a></li> <li><a href="/wiki/Parsing" title="Parsing">Parsing</a> <ul><li><a href="/wiki/Semantic_parsing" title="Semantic parsing">Semantic parsing</a></li> <li><a href="/wiki/Syntactic_parsing_(computational_linguistics)" title="Syntactic parsing (computational linguistics)">Syntactic parsing</a></li></ul></li> <li><a href="/wiki/Part-of-speech_tagging" title="Part-of-speech tagging">Part-of-speech tagging</a></li> <li><a href="/wiki/Semantic_analysis_(machine_learning)" title="Semantic analysis (machine learning)">Semantic analysis</a></li> <li><a href="/wiki/Semantic_role_labeling" title="Semantic role labeling">Semantic role labeling</a></li> <li><a href="/wiki/Semantic_decomposition_(natural_language_processing)" title="Semantic decomposition (natural language processing)">Semantic decomposition</a></li> <li><a href="/wiki/Semantic_similarity" title="Semantic similarity">Semantic similarity</a></li> <li><a href="/wiki/Sentiment_analysis" title="Sentiment analysis">Sentiment analysis</a></li></ul> <ul><li><a href="/wiki/Terminology_extraction" title="Terminology extraction">Terminology extraction</a></li> <li><a href="/wiki/Text_mining" title="Text mining">Text mining</a></li> <li><a href="/wiki/Textual_entailment" title="Textual entailment">Textual entailment</a></li> <li><a href="/wiki/Truecasing" title="Truecasing">Truecasing</a></li> <li><a href="/wiki/Word-sense_disambiguation" title="Word-sense disambiguation">Word-sense disambiguation</a></li> <li><a href="/wiki/Word-sense_induction" title="Word-sense induction">Word-sense induction</a></li></ul> </div><table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbody><tr><th id="Text_segmentation21" scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Text_segmentation" title="Text segmentation">Text segmentation</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Compound-term_processing" title="Compound-term processing">Compound-term processing</a></li> <li><a href="/wiki/Lemmatisation" class="mw-redirect" title="Lemmatisation">Lemmatisation</a></li> <li><a href="/wiki/Lexical_analysis" title="Lexical analysis">Lexical analysis</a></li> <li><a href="/wiki/Shallow_parsing" title="Shallow parsing">Text chunking</a></li> <li><a href="/wiki/Stemming" title="Stemming">Stemming</a></li> <li><a href="/wiki/Sentence_boundary_disambiguation" title="Sentence boundary disambiguation">Sentence segmentation</a></li> <li><a href="/wiki/Word#Word_boundaries" title="Word">Word segmentation</a></li></ul> </div></td></tr></tbody></table><div> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Automatic_summarization" title="Automatic summarization">Automatic summarization</a></th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Multi-document_summarization" title="Multi-document summarization">Multi-document summarization</a></li> <li><a href="/wiki/Sentence_extraction" title="Sentence extraction">Sentence extraction</a></li> <li><a href="/wiki/Text_simplification" title="Text simplification">Text simplification</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Machine_translation" title="Machine translation">Machine translation</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Computer-assisted_translation" title="Computer-assisted translation">Computer-assisted</a></li> <li><a href="/wiki/Example-based_machine_translation" title="Example-based machine translation">Example-based</a></li> <li><a href="/wiki/Rule-based_machine_translation" title="Rule-based machine translation">Rule-based</a></li> <li><a href="/wiki/Statistical_machine_translation" title="Statistical machine translation">Statistical</a></li> <li><a href="/wiki/Transfer-based_machine_translation" title="Transfer-based machine translation">Transfer-based</a></li> <li><a href="/wiki/Neural_machine_translation" title="Neural machine translation">Neural</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Distributional_semantics" title="Distributional semantics">Distributional semantics</a> models</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/BERT_(language_model)" title="BERT (language model)">BERT</a></li> <li><a href="/wiki/Document-term_matrix" title="Document-term matrix">Document-term matrix</a></li> <li><a href="/wiki/Explicit_semantic_analysis" title="Explicit semantic analysis">Explicit semantic analysis</a></li> <li><a href="/wiki/FastText" title="FastText">fastText</a></li> <li><a href="/wiki/GloVe" title="GloVe">GloVe</a></li> <li><a href="/wiki/Language_model" title="Language model">Language model</a> (<a href="/wiki/Large_language_model" title="Large language model">large</a>)</li> <li><a href="/wiki/Latent_semantic_analysis" title="Latent semantic analysis">Latent semantic analysis</a></li> <li><a href="/wiki/Seq2seq" title="Seq2seq">Seq2seq</a></li> <li><a href="/wiki/Word_embedding" title="Word embedding">Word embedding</a></li> <li><a href="/wiki/Word2vec" title="Word2vec">Word2vec</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Language_resource" title="Language resource">Language resources</a>,<br />datasets and corpora</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"></div><table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbody><tr><th scope="row" class="navbox-group" style="width:1%">Types and<br />standards</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Corpus_linguistics" title="Corpus linguistics">Corpus linguistics</a></li> <li><a href="/wiki/Lexical_resource" title="Lexical resource">Lexical resource</a></li> <li><a href="/wiki/Linguistic_Linked_Open_Data" title="Linguistic Linked Open Data">Linguistic Linked Open Data</a></li> <li><a href="/wiki/Machine-readable_dictionary" title="Machine-readable dictionary">Machine-readable dictionary</a></li> <li><a href="/wiki/Parallel_text" title="Parallel text">Parallel text</a></li> <li><a href="/wiki/PropBank" title="PropBank">PropBank</a></li> <li><a href="/wiki/Semantic_network" title="Semantic network">Semantic network</a></li> <li><a href="/wiki/Simple_Knowledge_Organization_System" title="Simple Knowledge Organization System">Simple Knowledge Organization System</a></li> <li><a href="/wiki/Speech_corpus" title="Speech corpus">Speech corpus</a></li> <li><a class="mw-selflink selflink">Text corpus</a></li> <li><a href="/wiki/Thesaurus_(information_retrieval)" title="Thesaurus (information retrieval)">Thesaurus (information retrieval)</a></li> <li><a href="/wiki/Treebank" title="Treebank">Treebank</a></li> <li><a href="/wiki/Universal_Dependencies" title="Universal Dependencies">Universal Dependencies</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Data</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/BabelNet" title="BabelNet">BabelNet</a></li> <li><a href="/wiki/Bank_of_English" title="Bank of English">Bank of English</a></li> <li><a href="/wiki/DBpedia" title="DBpedia">DBpedia</a></li> <li><a href="/wiki/FrameNet" title="FrameNet">FrameNet</a></li> <li><a href="/wiki/Google_Ngram_Viewer" class="mw-redirect" title="Google Ngram Viewer">Google Ngram Viewer</a></li> <li><a href="/wiki/UBY" title="UBY">UBY</a></li> <li><a href="/wiki/WordNet" title="WordNet">WordNet</a></li> <li><a href="/wiki/Wikidata" title="Wikidata">Wikidata</a></li></ul> </div></td></tr></tbody></table><div></div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Automatic_identification_and_data_capture" title="Automatic identification and data capture">Automatic identification<br />and data capture</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Speech_recognition" title="Speech recognition">Speech recognition</a></li> <li><a href="/wiki/Speech_segmentation" title="Speech segmentation">Speech segmentation</a></li> <li><a href="/wiki/Speech_synthesis" title="Speech synthesis">Speech synthesis</a></li> <li><a href="/wiki/Natural_language_generation" title="Natural language generation">Natural language generation</a></li> <li><a href="/wiki/Optical_character_recognition" title="Optical character recognition">Optical character recognition</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Topic_model" title="Topic model">Topic model</a></th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Document_classification" title="Document classification">Document classification</a></li> <li><a href="/wiki/Latent_Dirichlet_allocation" title="Latent Dirichlet allocation">Latent Dirichlet allocation</a></li> <li><a href="/wiki/Pachinko_allocation" title="Pachinko allocation">Pachinko allocation</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Computer-assisted_reviewing" title="Computer-assisted reviewing">Computer-assisted<br />reviewing</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Automated_essay_scoring" title="Automated essay scoring">Automated essay scoring</a></li> <li><a href="/wiki/Concordancer" title="Concordancer">Concordancer</a></li> <li><a href="/wiki/Grammar_checker" title="Grammar checker">Grammar checker</a></li> <li><a href="/wiki/Predictive_text" title="Predictive text">Predictive text</a></li> <li><a href="/wiki/Pronunciation_assessment" title="Pronunciation assessment">Pronunciation assessment</a></li> <li><a href="/wiki/Spell_checker" title="Spell checker">Spell checker</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Natural-language_user_interface" title="Natural-language user interface">Natural language<br />user interface</a></th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Chatbot" title="Chatbot">Chatbot</a></li> <li><a href="/wiki/Interactive_fiction" title="Interactive fiction">Interactive fiction</a> (c.f. <a href="/wiki/Syntax_guessing" class="mw-redirect" title="Syntax guessing">Syntax guessing</a>)</li> <li><a href="/wiki/Question_answering" title="Question answering">Question answering</a></li> <li><a href="/wiki/Virtual_assistant" title="Virtual assistant">Virtual assistant</a></li> <li><a href="/wiki/Voice_user_interface" title="Voice user interface">Voice user interface</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Related</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Formal_semantics_(natural_language)" title="Formal semantics (natural language)">Formal semantics</a></li> <li><a href="/wiki/Hallucination_(artificial_intelligence)" title="Hallucination (artificial intelligence)">Hallucination</a></li> <li><a href="/wiki/Natural_Language_Toolkit" title="Natural Language Toolkit">Natural Language Toolkit</a></li> <li><a href="/wiki/SpaCy" title="SpaCy">spaCy</a></li></ul> </div></td></tr></tbody></table></div> <!-- NewPP limit report Parsed by mw‐web.eqiad.main‐8669bc5c8‐8nl5c Cached time: 20250318155255 Cache expiry: 2592000 Reduced expiry: false Complications: [vary‐revision‐sha1, show‐toc] CPU time usage: 0.289 seconds Real time usage: 0.475 seconds Preprocessor visited node count: 740/1000000 Post‐expand include size: 34940/2097152 bytes Template argument size: 768/2097152 bytes Highest expansion depth: 12/100 Expensive parser function count: 3/500 Unstrip recursion depth: 1/20 Unstrip post‐expand size: 19022/5000000 bytes Lua time usage: 0.186/10.000 seconds Lua memory usage: 4737031/52428800 bytes Number of Wikibase entities loaded: 0/400 --> <!-- Transclusion expansion time report (%,ms,calls,template) 100.00% 411.228 1 -total 29.67% 122.004 1 Template:Short_description 27.60% 113.514 1 Template:Reflist 27.06% 111.279 1 Template:Natural_Language_Processing 26.11% 107.390 3 Template:Navbox 21.32% 87.664 2 Template:Pagetype 21.18% 87.102 1 Template:Cite_book 5.94% 24.440 1 Template:Main_article 5.38% 22.108 4 Template:Main_other 4.75% 19.520 1 Template:SDcat --> <!-- Saved in parser cache with key enwiki:pcache:53887:|#|:idhash:canonical and timestamp 20250318155255 and revision id 1257324423. Rendering was triggered because: page-view --> </div><!--esi <esi:include src="/esitest-fa8a495983347898/content" /> --><noscript><img src="https://login.wikimedia.org/wiki/Special:CentralAutoLogin/start?useformat=desktop&type=1x1&usesul3=0" alt="" width="1" height="1" style="border: none; position: absolute;"></noscript> <div class="printfooter" data-nosnippet="">Retrieved from "<a dir="ltr" href="https://en.wikipedia.org/w/index.php?title=Text_corpus&oldid=1257324423">https://en.wikipedia.org/w/index.php?title=Text_corpus&oldid=1257324423</a>"</div></div> <div id="catlinks" class="catlinks" data-mw="interface"><div id="mw-normal-catlinks" class="mw-normal-catlinks"><a href="/wiki/Help:Category" title="Help:Category">Categories</a>: <ul><li><a href="/wiki/Category:Discourse_analysis" title="Category:Discourse analysis">Discourse analysis</a></li><li><a href="/wiki/Category:Corpus_linguistics" title="Category:Corpus linguistics">Corpus linguistics</a></li><li><a href="/wiki/Category:Computational_linguistics" title="Category:Computational linguistics">Computational linguistics</a></li><li><a href="/wiki/Category:Works_based_on_multiple_works" title="Category:Works based on multiple works">Works based on multiple works</a></li><li><a href="/wiki/Category:Test_items" title="Category:Test items">Test items</a></li></ul></div><div id="mw-hidden-catlinks" class="mw-hidden-catlinks mw-hidden-cats-hidden">Hidden categories: <ul><li><a href="/wiki/Category:Articles_with_short_description" title="Category:Articles with short description">Articles with short description</a></li><li><a href="/wiki/Category:Short_description_is_different_from_Wikidata" title="Category:Short description is different from Wikidata">Short description is different from Wikidata</a></li><li><a href="/wiki/Category:Use_dmy_dates_from_July_2022" title="Category:Use dmy dates from July 2022">Use dmy dates from July 2022</a></li><li><a href="/wiki/Category:Webarchive_template_wayback_links" title="Category:Webarchive template wayback links">Webarchive template wayback links</a></li></ul></div></div> </div> </main> </div> <div class="mw-footer-container"> <footer id="footer" class="mw-footer" > <ul id="footer-info"> <li id="footer-info-lastmod"> This page was last edited on 14 November 2024, at 09:48<span class="anonymous-show"> (UTC)</span>.</li> <li id="footer-info-copyright">Text is available under the <a href="/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_4.0_International_License" title="Wikipedia:Text of the Creative Commons Attribution-ShareAlike 4.0 International License">Creative Commons Attribution-ShareAlike 4.0 License</a>; additional terms may apply. By using this site, you agree to the <a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Terms_of_Use" class="extiw" title="foundation:Special:MyLanguage/Policy:Terms of Use">Terms of Use</a> and <a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy" class="extiw" title="foundation:Special:MyLanguage/Policy:Privacy policy">Privacy Policy</a>. Wikipedia® is a registered trademark of the <a rel="nofollow" class="external text" href="https://wikimediafoundation.org/">Wikimedia Foundation, Inc.</a>, a non-profit organization.</li> </ul> <ul id="footer-places"> <li id="footer-places-privacy"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy">Privacy policy</a></li> <li id="footer-places-about"><a href="/wiki/Wikipedia:About">About Wikipedia</a></li> <li id="footer-places-disclaimers"><a href="/wiki/Wikipedia:General_disclaimer">Disclaimers</a></li> <li id="footer-places-contact"><a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us">Contact Wikipedia</a></li> <li id="footer-places-wm-codeofconduct"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Universal_Code_of_Conduct">Code of Conduct</a></li> <li id="footer-places-developers"><a href="https://developer.wikimedia.org">Developers</a></li> <li id="footer-places-statslink"><a href="https://stats.wikimedia.org/#/en.wikipedia.org">Statistics</a></li> <li id="footer-places-cookiestatement"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Cookie_statement">Cookie statement</a></li> <li id="footer-places-mobileview"><a href="//en.m.wikipedia.org/w/index.php?title=Text_corpus&mobileaction=toggle_view_mobile" class="noprint stopMobileRedirectToggle">Mobile view</a></li> </ul> <ul id="footer-icons" class="noprint"> <li id="footer-copyrightico"><a href="https://wikimediafoundation.org/" class="cdx-button cdx-button--fake-button cdx-button--size-large cdx-button--fake-button--enabled"><picture><source media="(min-width: 500px)" srcset="/static/images/footer/wikimedia-button.svg" width="84" height="29"><img src="/static/images/footer/wikimedia.svg" width="25" height="25" alt="Wikimedia Foundation" lang="en" loading="lazy"></picture></a></li> <li id="footer-poweredbyico"><a href="https://www.mediawiki.org/" class="cdx-button cdx-button--fake-button cdx-button--size-large cdx-button--fake-button--enabled"><picture><source media="(min-width: 500px)" srcset="/w/resources/assets/poweredby_mediawiki.svg" width="88" height="31"><img src="/w/resources/assets/mediawiki_compact.svg" alt="Powered by MediaWiki" lang="en" width="25" height="25" loading="lazy"></picture></a></li> </ul> </footer> </div> </div> </div> <div class="vector-header-container vector-sticky-header-container"> <div id="vector-sticky-header" class="vector-sticky-header"> <div class="vector-sticky-header-start"> <div class="vector-sticky-header-icon-start vector-button-flush-left vector-button-flush-right" aria-hidden="true"> <button class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-sticky-header-search-toggle" tabindex="-1" data-event-name="ui.vector-sticky-search-form.icon"><span class="vector-icon mw-ui-icon-search mw-ui-icon-wikimedia-search"></span> <span>Search</span> </button> </div> <div role="search" class="vector-search-box-vue vector-search-box-show-thumbnail vector-search-box"> <div class="vector-typeahead-search-container"> <div class="cdx-typeahead-search cdx-typeahead-search--show-thumbnail"> <form action="/w/index.php" id="vector-sticky-search-form" class="cdx-search-input cdx-search-input--has-end-button"> <div class="cdx-search-input__input-wrapper" data-search-loc="header-moved"> <div class="cdx-text-input cdx-text-input--has-start-icon"> <input class="cdx-text-input__input" type="search" name="search" placeholder="Search Wikipedia"> <span class="cdx-text-input__icon cdx-text-input__start-icon"></span> </div> <input type="hidden" name="title" value="Special:Search"> </div> <button class="cdx-button cdx-search-input__end-button">Search</button> </form> </div> </div> </div> <div class="vector-sticky-header-context-bar"> <nav aria-label="Contents" class="vector-toc-landmark"> <div id="vector-sticky-header-toc" class="vector-dropdown mw-portlet mw-portlet-sticky-header-toc vector-sticky-header-toc vector-button-flush-left" > <input type="checkbox" id="vector-sticky-header-toc-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-sticky-header-toc" class="vector-dropdown-checkbox " aria-label="Toggle the table of contents" > <label id="vector-sticky-header-toc-label" for="vector-sticky-header-toc-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-listBullet mw-ui-icon-wikimedia-listBullet"></span> <span class="vector-dropdown-label-text">Toggle the table of contents</span> </label> <div class="vector-dropdown-content"> <div id="vector-sticky-header-toc-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <div class="vector-sticky-header-context-bar-primary" aria-hidden="true" ><span class="mw-page-title-main">Text corpus</span></div> </div> </div> <div class="vector-sticky-header-end" aria-hidden="true"> <div class="vector-sticky-header-icons"> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" id="ca-talk-sticky-header" tabindex="-1" data-event-name="talk-sticky-header"><span class="vector-icon mw-ui-icon-speechBubbles mw-ui-icon-wikimedia-speechBubbles"></span> <span></span> </a> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" id="ca-subject-sticky-header" tabindex="-1" data-event-name="subject-sticky-header"><span class="vector-icon mw-ui-icon-article mw-ui-icon-wikimedia-article"></span> <span></span> </a> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" id="ca-history-sticky-header" tabindex="-1" data-event-name="history-sticky-header"><span class="vector-icon mw-ui-icon-wikimedia-history mw-ui-icon-wikimedia-wikimedia-history"></span> <span></span> </a> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only mw-watchlink" id="ca-watchstar-sticky-header" tabindex="-1" data-event-name="watch-sticky-header"><span class="vector-icon mw-ui-icon-wikimedia-star mw-ui-icon-wikimedia-wikimedia-star"></span> <span></span> </a> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" id="ca-edit-sticky-header" tabindex="-1" data-event-name="wikitext-edit-sticky-header"><span class="vector-icon mw-ui-icon-wikimedia-wikiText mw-ui-icon-wikimedia-wikimedia-wikiText"></span> <span></span> </a> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" id="ca-ve-edit-sticky-header" tabindex="-1" data-event-name="ve-edit-sticky-header"><span class="vector-icon mw-ui-icon-wikimedia-edit mw-ui-icon-wikimedia-wikimedia-edit"></span> <span></span> </a> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" id="ca-viewsource-sticky-header" tabindex="-1" data-event-name="ve-edit-protected-sticky-header"><span class="vector-icon mw-ui-icon-wikimedia-editLock mw-ui-icon-wikimedia-wikimedia-editLock"></span> <span></span> </a> </div> <div class="vector-sticky-header-buttons"> <button class="cdx-button cdx-button--weight-quiet mw-interlanguage-selector" id="p-lang-btn-sticky-header" tabindex="-1" data-event-name="ui.dropdown-p-lang-btn-sticky-header"><span class="vector-icon mw-ui-icon-wikimedia-language mw-ui-icon-wikimedia-wikimedia-language"></span> <span>51 languages</span> </button> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--action-progressive" id="ca-addsection-sticky-header" tabindex="-1" data-event-name="addsection-sticky-header"><span class="vector-icon mw-ui-icon-speechBubbleAdd-progressive mw-ui-icon-wikimedia-speechBubbleAdd-progressive"></span> <span>Add topic</span> </a> </div> <div class="vector-sticky-header-icon-end"> <div class="vector-user-links"> </div> </div> </div> </div> </div> <div class="mw-portlet mw-portlet-dock-bottom emptyPortlet" id="p-dock-bottom"> <ul> </ul> </div> <script>(RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgHostname":"mw-web.eqiad.main-dd5c8cc7b-8n7jn","wgBackendResponseTime":160,"wgPageParseReport":{"limitreport":{"cputime":"0.289","walltime":"0.475","ppvisitednodes":{"value":740,"limit":1000000},"postexpandincludesize":{"value":34940,"limit":2097152},"templateargumentsize":{"value":768,"limit":2097152},"expansiondepth":{"value":12,"limit":100},"expensivefunctioncount":{"value":3,"limit":500},"unstrip-depth":{"value":1,"limit":20},"unstrip-size":{"value":19022,"limit":5000000},"entityaccesscount":{"value":0,"limit":400},"timingprofile":["100.00% 411.228 1 -total"," 29.67% 122.004 1 Template:Short_description"," 27.60% 113.514 1 Template:Reflist"," 27.06% 111.279 1 Template:Natural_Language_Processing"," 26.11% 107.390 3 Template:Navbox"," 21.32% 87.664 2 Template:Pagetype"," 21.18% 87.102 1 Template:Cite_book"," 5.94% 24.440 1 Template:Main_article"," 5.38% 22.108 4 Template:Main_other"," 4.75% 19.520 1 Template:SDcat"]},"scribunto":{"limitreport-timeusage":{"value":"0.186","limit":"10.000"},"limitreport-memusage":{"value":4737031,"limit":52428800}},"cachereport":{"origin":"mw-web.eqiad.main-8669bc5c8-8nl5c","timestamp":"20250318155255","ttl":2592000,"transientcontent":false}}});});</script> <script type="application/ld+json">{"@context":"https:\/\/schema.org","@type":"Article","name":"Text corpus","url":"https:\/\/en.wikipedia.org\/wiki\/Text_corpus","sameAs":"http:\/\/www.wikidata.org\/entity\/Q461183","mainEntity":"http:\/\/www.wikidata.org\/entity\/Q461183","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\/\/www.wikimedia.org\/static\/images\/wmf-hor-googpub.png"}},"datePublished":"2002-05-28T01:58:52Z","dateModified":"2024-11-14T09:48:39Z","headline":"large and structured set of texts being the basis for linguistic research"}</script> </body> </html>