<!-- CINXE.COM
tf–idf - Wikipedia -->
<!DOCTYPE html> <html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available" lang="en" dir="ltr"> <head> <meta charset="UTF-8"> <title>tf–idf - Wikipedia</title> <script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available";var cookie=document.cookie.match(/(?:^|; )enwikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].split('%2C').forEach(function(pref){className=className.replace(new RegExp('(^| )'+pref.replace(/-clientpref-\w+$|[^\w-]+/g,'')+'-clientpref-\\w+( |$)'),'$1'+pref+'$2');});}document.documentElement.className=className;}());RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy", 
"wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"84e22ab3-76ca-4657-927a-eb67021a5d14","wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Tf–idf","wgTitle":"Tf–idf","wgCurRevisionId":1236851603,"wgRevisionId":1236851603,"wgArticleId":2057290,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Statistical natural language processing","Ranking functions","Vector space model"],"wgPageViewLanguage":"en","wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"Tf–idf","wgRelevantArticleId":2057290,"wgIsProbablyEditable":true,"wgRelevantPageIsProbablyEditable":true,"wgRestrictionEdit":[],"wgRestrictionMove":[],"wgRedirectedFrom":"Tf-idf","wgNoticeProject":"wikipedia", "wgCiteReferencePreviewsActive":false,"wgFlaggedRevsParams":{"tags":{"status":{"levels":1}}},"wgMediaViewerOnClick":true,"wgMediaViewerEnabledByDefault":true,"wgPopupsFlags":0,"wgVisualEditor":{"pageLanguageCode":"en","pageLanguageDir":"ltr","pageVariantFallbacks":"en"},"wgMFDisplayWikibaseDescriptions":{"search":true,"watchlist":true,"tagline":false,"nearby":true},"wgWMESchemaEditAttemptStepOversample":false,"wgWMEPageLength":20000,"wgInternalRedirectTargetUrl":"/wiki/Tf%E2%80%93idf","wgRelatedArticlesCompat":[],"wgCentralAuthMobileDomain":false,"wgEditSubmitButtonLabelPublish":true,"wgULSPosition":"interlanguage","wgULSisCompactLinksEnabled":false,"wgVector2022LanguageInHeader":true,"wgULSisLanguageSelectorEmpty":false,"wgWikibaseItemId":"Q796584","wgCheckUserClientHintsHeadersJsApi":["brands","architecture","bitness","fullVersionList","mobile","model","platform","platformVersion"],"GEHomepageSuggestedEditsEnableTopics":true,"wgGETopicsMatchModeEnabled":false, 
"wgGEStructuredTaskRejectionReasonTextInputEnabled":false,"wgGELevelingUpEnabledForUser":false};RLSTATE={"ext.globalCssJs.user.styles":"ready","site.styles":"ready","user.styles":"ready","ext.globalCssJs.user":"ready","user":"ready","user.options":"loading","ext.cite.styles":"ready","ext.math.styles":"ready","skins.vector.search.codex.styles":"ready","skins.vector.styles":"ready","skins.vector.icons":"ready","ext.wikimediamessages.styles":"ready","ext.visualEditor.desktopArticleTarget.noscript":"ready","ext.uls.interlanguage":"ready","wikibase.client.init":"ready","ext.wikimediaBadges":"ready"};RLPAGEMODULES=["mediawiki.action.view.redirect","ext.cite.ux-enhancements","mediawiki.page.media","site","mediawiki.page.ready","mediawiki.toc","skins.vector.js","ext.centralNotice.geoIP","ext.centralNotice.startUp","ext.gadget.ReferenceTooltips","ext.gadget.switcher","ext.urlShortener.toolbar","ext.centralauth.centralautologin","mmv.bootstrap","ext.popups", "ext.visualEditor.desktopArticleTarget.init","ext.visualEditor.targetLoader","ext.echo.centralauth","ext.eventLogging","ext.wikimediaEvents","ext.navigationTiming","ext.uls.interface","ext.cx.eventlogging.campaigns","ext.cx.uls.quick.actions","wikibase.client.vector-2022","ext.checkUser.clientHints","ext.growthExperiments.SuggestedEditSession","wikibase.sidebar.tracking"];</script> <script>(RLQ=window.RLQ||[]).push(function(){mw.loader.impl(function(){return["user.options@12s5i",function($,jQuery,require,module){mw.user.tokens.set({"patrolToken":"+\\","watchToken":"+\\","csrfToken":"+\\"}); }];});});</script> <link rel="stylesheet" href="/w/load.php?lang=en&modules=ext.cite.styles%7Cext.math.styles%7Cext.uls.interlanguage%7Cext.visualEditor.desktopArticleTarget.noscript%7Cext.wikimediaBadges%7Cext.wikimediamessages.styles%7Cskins.vector.icons%2Cstyles%7Cskins.vector.search.codex.styles%7Cwikibase.client.init&only=styles&skin=vector-2022"> <script async="" 
src="/w/load.php?lang=en&modules=startup&only=scripts&raw=1&skin=vector-2022"></script> <meta name="ResourceLoaderDynamicStyles" content=""> <link rel="stylesheet" href="/w/load.php?lang=en&modules=site.styles&only=styles&skin=vector-2022"> <meta name="generator" content="MediaWiki 1.44.0-wmf.4"> <meta name="referrer" content="origin"> <meta name="referrer" content="origin-when-cross-origin"> <meta name="robots" content="max-image-preview:standard"> <meta name="format-detection" content="telephone=no"> <meta name="viewport" content="width=1120"> <meta property="og:title" content="tf–idf - Wikipedia"> <meta property="og:type" content="website"> <link rel="preconnect" href="//upload.wikimedia.org"> <link rel="alternate" media="only screen and (max-width: 640px)" href="//en.m.wikipedia.org/wiki/Tf%E2%80%93idf"> <link rel="alternate" type="application/x-wiki" title="Edit this page" href="/w/index.php?title=Tf%E2%80%93idf&action=edit"> <link rel="apple-touch-icon" href="/static/apple-touch/wikipedia.png"> <link rel="icon" href="/static/favicon/wikipedia.ico"> <link rel="search" type="application/opensearchdescription+xml" href="/w/rest.php/v1/search" title="Wikipedia (en)"> <link rel="EditURI" type="application/rsd+xml" href="//en.wikipedia.org/w/api.php?action=rsd"> <link rel="canonical" href="https://en.wikipedia.org/wiki/Tf%E2%80%93idf"> <link rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/deed.en"> <link rel="alternate" type="application/atom+xml" title="Wikipedia Atom feed" href="/w/index.php?title=Special:RecentChanges&feed=atom"> <link rel="dns-prefetch" href="//meta.wikimedia.org" /> <link rel="dns-prefetch" href="//login.wikimedia.org"> </head> <body class="skin--responsive skin-vector skin-vector-search-vue mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject mw-editable page-Tf–idf rootpage-Tf–idf skin-vector-2022 action-view"><a class="mw-jump-link" href="#bodyContent">Jump to content</a> <div class="vector-header-container"> 
<header class="vector-header mw-header"> <div class="vector-header-start"> <nav class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-dropdown" class="vector-dropdown vector-main-menu-dropdown vector-button-flush-left vector-button-flush-right" > <input type="checkbox" id="vector-main-menu-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-main-menu-dropdown" class="vector-dropdown-checkbox " aria-label="Main menu" > <label id="vector-main-menu-dropdown-label" for="vector-main-menu-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-menu mw-ui-icon-wikimedia-menu"></span> <span class="vector-dropdown-label-text">Main menu</span> </label> <div class="vector-dropdown-content"> <div id="vector-main-menu-unpinned-container" class="vector-unpinned-container"> <div id="vector-main-menu" class="vector-main-menu vector-pinnable-element"> <div class="vector-pinnable-header vector-main-menu-pinnable-header vector-pinnable-header-unpinned" data-feature-name="main-menu-pinned" data-pinnable-element-id="vector-main-menu" data-pinned-container-id="vector-main-menu-pinned-container" data-unpinned-container-id="vector-main-menu-unpinned-container" > <div class="vector-pinnable-header-label">Main menu</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-main-menu.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-main-menu.unpin">hide</button> </div> <div id="p-navigation" class="vector-menu mw-portlet mw-portlet-navigation" > <div class="vector-menu-heading"> Navigation </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li 
id="n-mainpage-description" class="mw-list-item"><a href="/wiki/Main_Page" title="Visit the main page [z]" accesskey="z"><span>Main page</span></a></li><li id="n-contents" class="mw-list-item"><a href="/wiki/Wikipedia:Contents" title="Guides to browsing Wikipedia"><span>Contents</span></a></li><li id="n-currentevents" class="mw-list-item"><a href="/wiki/Portal:Current_events" title="Articles related to current events"><span>Current events</span></a></li><li id="n-randompage" class="mw-list-item"><a href="/wiki/Special:Random" title="Visit a randomly selected article [x]" accesskey="x"><span>Random article</span></a></li><li id="n-aboutsite" class="mw-list-item"><a href="/wiki/Wikipedia:About" title="Learn about Wikipedia and how it works"><span>About Wikipedia</span></a></li><li id="n-contactpage" class="mw-list-item"><a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us" title="How to contact Wikipedia"><span>Contact us</span></a></li> </ul> </div> </div> <div id="p-interaction" class="vector-menu mw-portlet mw-portlet-interaction" > <div class="vector-menu-heading"> Contribute </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="n-help" class="mw-list-item"><a href="/wiki/Help:Contents" title="Guidance on how to use and edit Wikipedia"><span>Help</span></a></li><li id="n-introduction" class="mw-list-item"><a href="/wiki/Help:Introduction" title="Learn how to edit Wikipedia"><span>Learn to edit</span></a></li><li id="n-portal" class="mw-list-item"><a href="/wiki/Wikipedia:Community_portal" title="The hub for editors"><span>Community portal</span></a></li><li id="n-recentchanges" class="mw-list-item"><a href="/wiki/Special:RecentChanges" title="A list of recent changes to Wikipedia [r]" accesskey="r"><span>Recent changes</span></a></li><li id="n-upload" class="mw-list-item"><a href="/wiki/Wikipedia:File_upload_wizard" title="Add images or other media for use on Wikipedia"><span>Upload file</span></a></li> </ul> </div> </div> </div> 
</div> </div> </div> </nav> <a href="/wiki/Main_Page" class="mw-logo"> <img class="mw-logo-icon" src="/static/images/icons/wikipedia.png" alt="" aria-hidden="true" height="50" width="50"> <span class="mw-logo-container skin-invert"> <img class="mw-logo-wordmark" alt="Wikipedia" src="/static/images/mobile/copyright/wikipedia-wordmark-en.svg" style="width: 7.5em; height: 1.125em;"> <img class="mw-logo-tagline" alt="The Free Encyclopedia" src="/static/images/mobile/copyright/wikipedia-tagline-en.svg" width="117" height="13" style="width: 7.3125em; height: 0.8125em;"> </span> </a> </div> <div class="vector-header-end"> <div id="p-search" role="search" class="vector-search-box-vue vector-search-box-collapses vector-search-box-show-thumbnail vector-search-box-auto-expand-width vector-search-box"> <a href="/wiki/Special:Search" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only search-toggle" title="Search Wikipedia [f]" accesskey="f"><span class="vector-icon mw-ui-icon-search mw-ui-icon-wikimedia-search"></span> <span>Search</span> </a> <div class="vector-typeahead-search-container"> <div class="cdx-typeahead-search cdx-typeahead-search--show-thumbnail cdx-typeahead-search--auto-expand-width"> <form action="/w/index.php" id="searchform" class="cdx-search-input cdx-search-input--has-end-button"> <div id="simpleSearch" class="cdx-search-input__input-wrapper" data-search-loc="header-moved"> <div class="cdx-text-input cdx-text-input--has-start-icon"> <input class="cdx-text-input__input" type="search" name="search" placeholder="Search Wikipedia" aria-label="Search Wikipedia" autocapitalize="sentences" title="Search Wikipedia [f]" accesskey="f" id="searchInput" > <span class="cdx-text-input__icon cdx-text-input__start-icon"></span> </div> <input type="hidden" name="title" value="Special:Search"> </div> <button class="cdx-button cdx-search-input__end-button">Search</button> </form> </div> </div> </div> <nav 
class="vector-user-links vector-user-links-wide" aria-label="Personal tools"> <div class="vector-user-links-main"> <div id="p-vector-user-menu-preferences" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <div id="p-vector-user-menu-userpage" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <nav class="vector-appearance-landmark" aria-label="Appearance"> <div id="vector-appearance-dropdown" class="vector-dropdown " title="Change the appearance of the page's font size, width, and color" > <input type="checkbox" id="vector-appearance-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-appearance-dropdown" class="vector-dropdown-checkbox " aria-label="Appearance" > <label id="vector-appearance-dropdown-label" for="vector-appearance-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-appearance mw-ui-icon-wikimedia-appearance"></span> <span class="vector-dropdown-label-text">Appearance</span> </label> <div class="vector-dropdown-content"> <div id="vector-appearance-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <div id="p-vector-user-menu-notifications" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <div id="p-vector-user-menu-overflow" class="vector-menu mw-portlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-sitesupport-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" 
href="https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en" class=""><span>Donate</span></a> </li> <li id="pt-createaccount-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="/w/index.php?title=Special:CreateAccount&returnto=Tf%E2%80%93idf" title="You are encouraged to create an account and log in; however, it is not mandatory" class=""><span>Create account</span></a> </li> <li id="pt-login-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="/w/index.php?title=Special:UserLogin&returnto=Tf%E2%80%93idf" title="You're encouraged to log in; however, it's not mandatory. [o]" accesskey="o" class=""><span>Log in</span></a> </li> </ul> </div> </div> </div> <div id="vector-user-links-dropdown" class="vector-dropdown vector-user-menu vector-button-flush-right vector-user-menu-logged-out" title="Log in and more options" > <input type="checkbox" id="vector-user-links-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-user-links-dropdown" class="vector-dropdown-checkbox " aria-label="Personal tools" > <label id="vector-user-links-dropdown-label" for="vector-user-links-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-ellipsis mw-ui-icon-wikimedia-ellipsis"></span> <span class="vector-dropdown-label-text">Personal tools</span> </label> <div class="vector-dropdown-content"> <div id="p-personal" class="vector-menu mw-portlet mw-portlet-personal user-links-collapsible-item" title="User menu" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-sitesupport" class="user-links-collapsible-item mw-list-item"><a 
href="https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en"><span>Donate</span></a></li><li id="pt-createaccount" class="user-links-collapsible-item mw-list-item"><a href="/w/index.php?title=Special:CreateAccount&returnto=Tf%E2%80%93idf" title="You are encouraged to create an account and log in; however, it is not mandatory"><span class="vector-icon mw-ui-icon-userAdd mw-ui-icon-wikimedia-userAdd"></span> <span>Create account</span></a></li><li id="pt-login" class="user-links-collapsible-item mw-list-item"><a href="/w/index.php?title=Special:UserLogin&returnto=Tf%E2%80%93idf" title="You're encouraged to log in; however, it's not mandatory. [o]" accesskey="o"><span class="vector-icon mw-ui-icon-logIn mw-ui-icon-wikimedia-logIn"></span> <span>Log in</span></a></li> </ul> </div> </div> <div id="p-user-menu-anon-editor" class="vector-menu mw-portlet mw-portlet-user-menu-anon-editor" > <div class="vector-menu-heading"> Pages for logged out editors <a href="/wiki/Help:Introduction" aria-label="Learn more about editing"><span>learn more</span></a> </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-anoncontribs" class="mw-list-item"><a href="/wiki/Special:MyContributions" title="A list of edits made from this IP address [y]" accesskey="y"><span>Contributions</span></a></li><li id="pt-anontalk" class="mw-list-item"><a href="/wiki/Special:MyTalk" title="Discussion about edits from this IP address [n]" accesskey="n"><span>Talk</span></a></li> </ul> </div> </div> </div> </div> </nav> </div> </header> </div> <div class="mw-page-container"> <div class="mw-page-container-inner"> <div class="vector-sitenotice-container"> <div id="siteNotice"><!-- CentralNotice --></div> </div> <div class="vector-column-start"> <div class="vector-main-menu-container"> <div id="mw-navigation"> <nav id="mw-panel" class="vector-main-menu-landmark" aria-label="Site"> <div 
id="vector-main-menu-pinned-container" class="vector-pinned-container"> </div> </nav> </div> </div> <div class="vector-sticky-pinned-container"> <nav id="mw-panel-toc" aria-label="Contents" data-event-name="ui.sidebar-toc" class="mw-table-of-contents-container vector-toc-landmark"> <div id="vector-toc-pinned-container" class="vector-pinned-container"> <div id="vector-toc" class="vector-toc vector-pinnable-element"> <div class="vector-pinnable-header vector-toc-pinnable-header vector-pinnable-header-pinned" data-feature-name="toc-pinned" data-pinnable-element-id="vector-toc" > <h2 class="vector-pinnable-header-label">Contents</h2> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-toc.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-toc.unpin">hide</button> </div> <ul class="vector-toc-contents" id="mw-panel-toc-list"> <li id="toc-mw-content-text" class="vector-toc-list-item vector-toc-level-1"> <a href="#" class="vector-toc-link"> <div class="vector-toc-text">(Top)</div> </a> </li> <li id="toc-Motivations" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Motivations"> <div class="vector-toc-text"> <span class="vector-toc-numb">1</span> <span>Motivations</span> </div> </a> <ul id="toc-Motivations-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Definition" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Definition"> <div class="vector-toc-text"> <span class="vector-toc-numb">2</span> <span>Definition</span> </div> </a> <button aria-controls="toc-Definition-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Definition 
subsection</span> </button> <ul id="toc-Definition-sublist" class="vector-toc-list"> <li id="toc-Term_frequency" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Term_frequency"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.1</span> <span>Term frequency</span> </div> </a> <ul id="toc-Term_frequency-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Inverse_document_frequency" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Inverse_document_frequency"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.2</span> <span>Inverse document frequency</span> </div> </a> <ul id="toc-Inverse_document_frequency-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Term_frequency–inverse_document_frequency" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Term_frequency–inverse_document_frequency"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.3</span> <span>Term frequency–inverse document frequency</span> </div> </a> <ul id="toc-Term_frequency–inverse_document_frequency-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Justification_of_idf" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Justification_of_idf"> <div class="vector-toc-text"> <span class="vector-toc-numb">3</span> <span>Justification of idf</span> </div> </a> <ul id="toc-Justification_of_idf-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Link_with_information_theory" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Link_with_information_theory"> <div class="vector-toc-text"> <span class="vector-toc-numb">4</span> <span>Link with information theory</span> </div> </a> <ul id="toc-Link_with_information_theory-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Example_of_tf–idf" 
class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Example_of_tf–idf"> <div class="vector-toc-text"> <span class="vector-toc-numb">5</span> <span>Example of tf–idf</span> </div> </a> <ul id="toc-Example_of_tf–idf-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Beyond_terms" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Beyond_terms"> <div class="vector-toc-text"> <span class="vector-toc-numb">6</span> <span>Beyond terms</span> </div> </a> <ul id="toc-Beyond_terms-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Derivatives" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Derivatives"> <div class="vector-toc-text"> <span class="vector-toc-numb">7</span> <span>Derivatives</span> </div> </a> <ul id="toc-Derivatives-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-See_also" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#See_also"> <div class="vector-toc-text"> <span class="vector-toc-numb">8</span> <span>See also</span> </div> </a> <ul id="toc-See_also-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-References" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#References"> <div class="vector-toc-text"> <span class="vector-toc-numb">9</span> <span>References</span> </div> </a> <ul id="toc-References-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-External_links_and_suggested_reading" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#External_links_and_suggested_reading"> <div class="vector-toc-text"> <span class="vector-toc-numb">10</span> <span>External links and suggested reading</span> </div> </a> <ul 
id="toc-External_links_and_suggested_reading-sublist" class="vector-toc-list"> </ul> </li> </ul> </div> </div> </nav> </div> </div> <div class="mw-content-container"> <main id="content" class="mw-body"> <header class="mw-body-header vector-page-titlebar"> <nav aria-label="Contents" class="vector-toc-landmark"> <div id="vector-page-titlebar-toc" class="vector-dropdown vector-page-titlebar-toc vector-button-flush-left" > <input type="checkbox" id="vector-page-titlebar-toc-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-titlebar-toc" class="vector-dropdown-checkbox " aria-label="Toggle the table of contents" > <label id="vector-page-titlebar-toc-label" for="vector-page-titlebar-toc-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-listBullet mw-ui-icon-wikimedia-listBullet"></span> <span class="vector-dropdown-label-text">Toggle the table of contents</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-titlebar-toc-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <h1 id="firstHeading" class="firstHeading mw-first-heading">tf–idf</h1> <div id="p-lang-btn" class="vector-dropdown mw-portlet mw-portlet-lang" > <input type="checkbox" id="p-lang-btn-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-p-lang-btn" class="vector-dropdown-checkbox mw-interlanguage-selector" aria-label="Go to an article in another language. 
Available in 23 languages" > <label id="p-lang-btn-label" for="p-lang-btn-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--action-progressive mw-portlet-lang-heading-23" aria-hidden="true" ><span class="vector-icon mw-ui-icon-language-progressive mw-ui-icon-wikimedia-language-progressive"></span> <span class="vector-dropdown-label-text">23 languages</span> </label> <div class="vector-dropdown-content"> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li class="interlanguage-link interwiki-ar mw-list-item"><a href="https://ar.wikipedia.org/wiki/%D8%AA%D9%8A_%D8%A7%D9%81-%D8%A7%D9%8A_%D8%AF%D9%8A_%D8%AF%D9%81" title="تي اف-اي دي دف – Arabic" lang="ar" hreflang="ar" data-title="تي اف-اي دي دف" data-language-autonym="العربية" data-language-local-name="Arabic" class="interlanguage-link-target"><span>العربية</span></a></li><li class="interlanguage-link interwiki-ca mw-list-item"><a href="https://ca.wikipedia.org/wiki/Tf-idf" title="Tf-idf – Catalan" lang="ca" hreflang="ca" data-title="Tf-idf" data-language-autonym="Català" data-language-local-name="Catalan" class="interlanguage-link-target"><span>Català</span></a></li><li class="interlanguage-link interwiki-cs mw-list-item"><a href="https://cs.wikipedia.org/wiki/Tf-idf" title="Tf-idf – Czech" lang="cs" hreflang="cs" data-title="Tf-idf" data-language-autonym="Čeština" data-language-local-name="Czech" class="interlanguage-link-target"><span>Čeština</span></a></li><li class="interlanguage-link interwiki-de mw-list-item"><a href="https://de.wikipedia.org/wiki/Tf-idf-Ma%C3%9F" title="Tf-idf-Maß – German" lang="de" hreflang="de" data-title="Tf-idf-Maß" data-language-autonym="Deutsch" data-language-local-name="German" class="interlanguage-link-target"><span>Deutsch</span></a></li><li class="interlanguage-link interwiki-et mw-list-item"><a href="https://et.wikipedia.org/wiki/Tf%E2%80%93idf" title="Tf–idf – 
Estonian" lang="et" hreflang="et" data-title="Tf–idf" data-language-autonym="Eesti" data-language-local-name="Estonian" class="interlanguage-link-target"><span>Eesti</span></a></li><li class="interlanguage-link interwiki-es mw-list-item"><a href="https://es.wikipedia.org/wiki/Tf-idf" title="Tf-idf – Spanish" lang="es" hreflang="es" data-title="Tf-idf" data-language-autonym="Español" data-language-local-name="Spanish" class="interlanguage-link-target"><span>Español</span></a></li><li class="interlanguage-link interwiki-eu mw-list-item"><a href="https://eu.wikipedia.org/wiki/Tf%E2%80%93idf" title="Tf–idf – Basque" lang="eu" hreflang="eu" data-title="Tf–idf" data-language-autonym="Euskara" data-language-local-name="Basque" class="interlanguage-link-target"><span>Euskara</span></a></li><li class="interlanguage-link interwiki-fa mw-list-item"><a href="https://fa.wikipedia.org/wiki/%D9%81%D8%B1%D8%A7%D9%88%D8%A7%D9%86%DB%8C_%D9%88%D8%B2%D9%86%DB%8C_%D8%AA%DB%8C%E2%80%8C%D8%A7%D9%81-%D8%A2%DB%8C%E2%80%8C%D8%AF%DB%8C%E2%80%8C%D8%A7%D9%81" title="فراوانی وزنی تیاف-آیدیاف – Persian" lang="fa" hreflang="fa" data-title="فراوانی وزنی تیاف-آیدیاف" data-language-autonym="فارسی" data-language-local-name="Persian" class="interlanguage-link-target"><span>فارسی</span></a></li><li class="interlanguage-link interwiki-fr mw-list-item"><a href="https://fr.wikipedia.org/wiki/TF-IDF" title="TF-IDF – French" lang="fr" hreflang="fr" data-title="TF-IDF" data-language-autonym="Français" data-language-local-name="French" class="interlanguage-link-target"><span>Français</span></a></li><li class="interlanguage-link interwiki-ko mw-list-item"><a href="https://ko.wikipedia.org/wiki/Tf-idf" title="Tf-idf – Korean" lang="ko" hreflang="ko" data-title="Tf-idf" data-language-autonym="한국어" data-language-local-name="Korean" class="interlanguage-link-target"><span>한국어</span></a></li><li class="interlanguage-link interwiki-id mw-list-item"><a href="https://id.wikipedia.org/wiki/Tf%E2%80%93idf" title="Tf–idf 
– Indonesian" lang="id" hreflang="id" data-title="Tf–idf" data-language-autonym="Bahasa Indonesia" data-language-local-name="Indonesian" class="interlanguage-link-target"><span>Bahasa Indonesia</span></a></li><li class="interlanguage-link interwiki-it mw-list-item"><a href="https://it.wikipedia.org/wiki/Tf-idf" title="Tf-idf – Italian" lang="it" hreflang="it" data-title="Tf-idf" data-language-autonym="Italiano" data-language-local-name="Italian" class="interlanguage-link-target"><span>Italiano</span></a></li><li class="interlanguage-link interwiki-he mw-list-item"><a href="https://he.wikipedia.org/wiki/Tf%E2%80%93idf" title="Tf–idf – Hebrew" lang="he" hreflang="he" data-title="Tf–idf" data-language-autonym="עברית" data-language-local-name="Hebrew" class="interlanguage-link-target"><span>עברית</span></a></li><li class="interlanguage-link interwiki-ja mw-list-item"><a href="https://ja.wikipedia.org/wiki/Tf-idf" title="Tf-idf – Japanese" lang="ja" hreflang="ja" data-title="Tf-idf" data-language-autonym="日本語" data-language-local-name="Japanese" class="interlanguage-link-target"><span>日本語</span></a></li><li class="interlanguage-link interwiki-no mw-list-item"><a href="https://no.wikipedia.org/wiki/Tf-idf" title="Tf-idf – Norwegian Bokmål" lang="nb" hreflang="nb" data-title="Tf-idf" data-language-autonym="Norsk bokmål" data-language-local-name="Norwegian Bokmål" class="interlanguage-link-target"><span>Norsk bokmål</span></a></li><li class="interlanguage-link interwiki-pl mw-list-item"><a href="https://pl.wikipedia.org/wiki/TFIDF" title="TFIDF – Polish" lang="pl" hreflang="pl" data-title="TFIDF" data-language-autonym="Polski" data-language-local-name="Polish" class="interlanguage-link-target"><span>Polski</span></a></li><li class="interlanguage-link interwiki-pt mw-list-item"><a href="https://pt.wikipedia.org/wiki/Tf%E2%80%93idf" title="Tf–idf – Portuguese" lang="pt" hreflang="pt" data-title="Tf–idf" data-language-autonym="Português" data-language-local-name="Portuguese" 
class="interlanguage-link-target"><span>Português</span></a></li><li class="interlanguage-link interwiki-ro mw-list-item"><a href="https://ro.wikipedia.org/wiki/Tf%E2%80%93idf" title="Tf–idf – Romanian" lang="ro" hreflang="ro" data-title="Tf–idf" data-language-autonym="Română" data-language-local-name="Romanian" class="interlanguage-link-target"><span>Română</span></a></li><li class="interlanguage-link interwiki-ru mw-list-item"><a href="https://ru.wikipedia.org/wiki/TF-IDF" title="TF-IDF – Russian" lang="ru" hreflang="ru" data-title="TF-IDF" data-language-autonym="Русский" data-language-local-name="Russian" class="interlanguage-link-target"><span>Русский</span></a></li><li class="interlanguage-link interwiki-uk mw-list-item"><a href="https://uk.wikipedia.org/wiki/TF-IDF" title="TF-IDF – Ukrainian" lang="uk" hreflang="uk" data-title="TF-IDF" data-language-autonym="Українська" data-language-local-name="Ukrainian" class="interlanguage-link-target"><span>Українська</span></a></li><li class="interlanguage-link interwiki-vi mw-list-item"><a href="https://vi.wikipedia.org/wiki/Tf%E2%80%93idf" title="Tf–idf – Vietnamese" lang="vi" hreflang="vi" data-title="Tf–idf" data-language-autonym="Tiếng Việt" data-language-local-name="Vietnamese" class="interlanguage-link-target"><span>Tiếng Việt</span></a></li><li class="interlanguage-link interwiki-zh-yue mw-list-item"><a href="https://zh-yue.wikipedia.org/wiki/Tf-idf" title="Tf-idf – Cantonese" lang="yue" hreflang="yue" data-title="Tf-idf" data-language-autonym="粵語" data-language-local-name="Cantonese" class="interlanguage-link-target"><span>粵語</span></a></li><li class="interlanguage-link interwiki-zh mw-list-item"><a href="https://zh.wikipedia.org/wiki/Tf-idf" title="Tf-idf – Chinese" lang="zh" hreflang="zh" data-title="Tf-idf" data-language-autonym="中文" data-language-local-name="Chinese" class="interlanguage-link-target"><span>中文</span></a></li> </ul> <div class="after-portlet after-portlet-lang"><span class="wb-langlinks-edit 
wb-langlinks-link"><a href="https://www.wikidata.org/wiki/Special:EntityPage/Q796584#sitelinks-wikipedia" title="Edit interlanguage links" class="wbc-editpage">Edit links</a></span></div> </div> </div> </div> </header> <div class="vector-page-toolbar"> <div class="vector-page-toolbar-container"> <div id="left-navigation"> <nav aria-label="Namespaces"> <div id="p-associated-pages" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-associated-pages" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-nstab-main" class="selected vector-tab-noicon mw-list-item"><a href="/wiki/Tf%E2%80%93idf" title="View the content page [c]" accesskey="c"><span>Article</span></a></li><li id="ca-talk" class="vector-tab-noicon mw-list-item"><a href="/wiki/Talk:Tf%E2%80%93idf" rel="discussion" title="Discuss improvements to the content page [t]" accesskey="t"><span>Talk</span></a></li> </ul> </div> </div> <div id="vector-variants-dropdown" class="vector-dropdown emptyPortlet" > <input type="checkbox" id="vector-variants-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-variants-dropdown" class="vector-dropdown-checkbox " aria-label="Change language variant" > <label id="vector-variants-dropdown-label" for="vector-variants-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" aria-hidden="true" ><span class="vector-dropdown-label-text">English</span> </label> <div class="vector-dropdown-content"> <div id="p-variants" class="vector-menu mw-portlet mw-portlet-variants emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> </div> </div> </nav> </div> <div id="right-navigation" class="vector-collapsible"> <nav aria-label="Views"> <div id="p-views" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-views" > <div class="vector-menu-content"> <ul 
class="vector-menu-content-list"> <li id="ca-view" class="selected vector-tab-noicon mw-list-item"><a href="/wiki/Tf%E2%80%93idf"><span>Read</span></a></li><li id="ca-edit" class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Tf%E2%80%93idf&action=edit" title="Edit this page [e]" accesskey="e"><span>Edit</span></a></li><li id="ca-history" class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Tf%E2%80%93idf&action=history" title="Past revisions of this page [h]" accesskey="h"><span>View history</span></a></li> </ul> </div> </div> </nav> <nav class="vector-page-tools-landmark" aria-label="Page tools"> <div id="vector-page-tools-dropdown" class="vector-dropdown vector-page-tools-dropdown" > <input type="checkbox" id="vector-page-tools-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-tools-dropdown" class="vector-dropdown-checkbox " aria-label="Tools" > <label id="vector-page-tools-dropdown-label" for="vector-page-tools-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" aria-hidden="true" ><span class="vector-dropdown-label-text">Tools</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-tools-unpinned-container" class="vector-unpinned-container"> <div id="vector-page-tools" class="vector-page-tools vector-pinnable-element"> <div class="vector-pinnable-header vector-page-tools-pinnable-header vector-pinnable-header-unpinned" data-feature-name="page-tools-pinned" data-pinnable-element-id="vector-page-tools" data-pinned-container-id="vector-page-tools-pinned-container" data-unpinned-container-id="vector-page-tools-unpinned-container" > <div class="vector-pinnable-header-label">Tools</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-page-tools.pin">move to sidebar</button> <button 
class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-page-tools.unpin">hide</button> </div> <div id="p-cactions" class="vector-menu mw-portlet mw-portlet-cactions emptyPortlet vector-has-collapsible-items" title="More options" > <div class="vector-menu-heading"> Actions </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-more-view" class="selected vector-more-collapsible-item mw-list-item"><a href="/wiki/Tf%E2%80%93idf"><span>Read</span></a></li><li id="ca-more-edit" class="vector-more-collapsible-item mw-list-item"><a href="/w/index.php?title=Tf%E2%80%93idf&action=edit" title="Edit this page [e]" accesskey="e"><span>Edit</span></a></li><li id="ca-more-history" class="vector-more-collapsible-item mw-list-item"><a href="/w/index.php?title=Tf%E2%80%93idf&action=history"><span>View history</span></a></li> </ul> </div> </div> <div id="p-tb" class="vector-menu mw-portlet mw-portlet-tb" > <div class="vector-menu-heading"> General </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="t-whatlinkshere" class="mw-list-item"><a href="/wiki/Special:WhatLinksHere/Tf%E2%80%93idf" title="List of all English Wikipedia pages containing links to this page [j]" accesskey="j"><span>What links here</span></a></li><li id="t-recentchangeslinked" class="mw-list-item"><a href="/wiki/Special:RecentChangesLinked/Tf%E2%80%93idf" rel="nofollow" title="Recent changes in pages linked from this page [k]" accesskey="k"><span>Related changes</span></a></li><li id="t-upload" class="mw-list-item"><a href="/wiki/Wikipedia:File_Upload_Wizard" title="Upload files [u]" accesskey="u"><span>Upload file</span></a></li><li id="t-specialpages" class="mw-list-item"><a href="/wiki/Special:SpecialPages" title="A list of all special pages [q]" accesskey="q"><span>Special pages</span></a></li><li id="t-permalink" class="mw-list-item"><a 
href="/w/index.php?title=Tf%E2%80%93idf&oldid=1236851603" title="Permanent link to this revision of this page"><span>Permanent link</span></a></li><li id="t-info" class="mw-list-item"><a href="/w/index.php?title=Tf%E2%80%93idf&action=info" title="More information about this page"><span>Page information</span></a></li><li id="t-cite" class="mw-list-item"><a href="/w/index.php?title=Special:CiteThisPage&page=Tf%E2%80%93idf&id=1236851603&wpFormIdentifier=titleform" title="Information on how to cite this page"><span>Cite this page</span></a></li><li id="t-urlshortener" class="mw-list-item"><a href="/w/index.php?title=Special:UrlShortener&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FTf%25E2%2580%2593idf"><span>Get shortened URL</span></a></li><li id="t-urlshortener-qrcode" class="mw-list-item"><a href="/w/index.php?title=Special:QrCode&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FTf%25E2%2580%2593idf"><span>Download QR code</span></a></li> </ul> </div> </div> <div id="p-coll-print_export" class="vector-menu mw-portlet mw-portlet-coll-print_export" > <div class="vector-menu-heading"> Print/export </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="coll-download-as-rl" class="mw-list-item"><a href="/w/index.php?title=Special:DownloadAsPdf&page=Tf%E2%80%93idf&action=show-download-screen" title="Download this page as a PDF file"><span>Download as PDF</span></a></li><li id="t-print" class="mw-list-item"><a href="/w/index.php?title=Tf%E2%80%93idf&printable=yes" title="Printable version of this page [p]" accesskey="p"><span>Printable version</span></a></li> </ul> </div> </div> <div id="p-wikibase-otherprojects" class="vector-menu mw-portlet mw-portlet-wikibase-otherprojects" > <div class="vector-menu-heading"> In other projects </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="t-wikibase" class="wb-otherproject-link wb-otherproject-wikibase-dataitem mw-list-item"><a 
href="https://www.wikidata.org/wiki/Special:EntityPage/Q796584" title="Structured data on this page hosted by Wikidata [g]" accesskey="g"><span>Wikidata item</span></a></li> </ul> </div> </div> </div> </div> </div> </div> </nav> </div> </div> </div> <div class="vector-column-end"> <div class="vector-sticky-pinned-container"> <nav class="vector-page-tools-landmark" aria-label="Page tools"> <div id="vector-page-tools-pinned-container" class="vector-pinned-container"> </div> </nav> <nav class="vector-appearance-landmark" aria-label="Appearance"> <div id="vector-appearance-pinned-container" class="vector-pinned-container"> <div id="vector-appearance" class="vector-appearance vector-pinnable-element"> <div class="vector-pinnable-header vector-appearance-pinnable-header vector-pinnable-header-pinned" data-feature-name="appearance-pinned" data-pinnable-element-id="vector-appearance" data-pinned-container-id="vector-appearance-pinned-container" data-unpinned-container-id="vector-appearance-unpinned-container" > <div class="vector-pinnable-header-label">Appearance</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-appearance.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-appearance.unpin">hide</button> </div> </div> </div> </nav> </div> </div> <div id="bodyContent" class="vector-body" aria-labelledby="firstHeading" data-mw-ve-target-container> <div class="vector-body-before-content"> <div class="mw-indicators"> </div> <div id="siteSub" class="noprint">From Wikipedia, the free encyclopedia</div> </div> <div id="contentSub"><div id="mw-content-subtitle"><span class="mw-redirectedfrom">(Redirected from <a href="/w/index.php?title=Tf-idf&redirect=no" class="mw-redirect" title="Tf-idf">Tf-idf</a>)</span></div></div> <div id="mw-content-text" class="mw-body-content"><div 
class="mw-content-ltr mw-parser-output" lang="en" dir="ltr"><div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">Estimate of the importance of a word in a document</div> <p>In <a href="/wiki/Information_retrieval" title="Information retrieval">information retrieval</a>, <b>tf–idf</b> (also <b>TF*IDF</b>, <b>TFIDF</b>, <b>TF–IDF</b>, or <b>Tf–idf</b>), short for <b>term frequency–inverse document frequency</b>, is a measure of importance of a word to a <a href="/wiki/Document" title="Document">document</a> in a collection or <a href="/wiki/Text_corpus" title="Text corpus">corpus</a>, adjusted for the fact that some words appear more frequently in general.<sup id="cite_ref-1" class="reference"><a href="#cite_note-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup> Like the bag-of-words model, it models a document as a <a href="/wiki/Multiset" title="Multiset">multiset</a> of words, without <a href="/wiki/Word_order" title="Word order">word order</a>. It is a refinement over the simple <a href="/wiki/Bag-of-words_model" title="Bag-of-words model">bag-of-words model</a>, by allowing the weight of words to depend on the rest of the corpus. </p><p>It was often used as a <a href="/wiki/Weighting_factor" class="mw-redirect" title="Weighting factor">weighting factor</a> in searches of information retrieval, <a href="/wiki/Text_mining" title="Text mining">text mining</a>, and <a href="/wiki/User_modeling" title="User modeling">user modeling</a>. 
A survey conducted in 2015 showed that 83% of text-based recommender systems in digital libraries used tf–idf.<sup id="cite_ref-2" class="reference"><a href="#cite_note-2"><span class="cite-bracket">[</span>2<span class="cite-bracket">]</span></a></sup> Variations of the tf–idf weighting scheme were often used by <a href="/wiki/Search_engine" title="Search engine">search engines</a> as a central tool in scoring and ranking a document's <a href="/wiki/Relevance_(information_retrieval)" title="Relevance (information retrieval)">relevance</a> given a user <a href="/wiki/Information_retrieval" title="Information retrieval">query</a>. </p><p>One of the simplest <a href="/wiki/Ranking_function" class="mw-redirect" title="Ranking function">ranking functions</a> is computed by summing the tf–idf for each query term; many more sophisticated ranking functions are variants of this simple model. </p> <meta property="mw:PageProp/toc" /> <div class="mw-heading mw-heading2"><h2 id="Motivations">Motivations</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Tf%E2%80%93idf&action=edit&section=1" title="Edit section: Motivations"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p><a href="/wiki/Karen_Sp%C3%A4rck_Jones" title="Karen Spärck Jones">Karen Spärck Jones</a> (1972) conceived a statistical interpretation of term-specificity called Inverse Document Frequency (idf), which became a cornerstone of term weighting:<sup id="cite_ref-3" class="reference"><a href="#cite_note-3"><span class="cite-bracket">[</span>3<span class="cite-bracket">]</span></a></sup> </p> <style data-mw-deduplicate="TemplateStyles:r1244412712">.mw-parser-output .templatequote{overflow:hidden;margin:1em 0;padding:0 32px}.mw-parser-output .templatequotecite{line-height:1.5em;text-align:left;margin-top:0}@media(min-width:500px){.mw-parser-output .templatequotecite{padding-left:1.6em}}</style><blockquote 
class="templatequote"><p>The specificity of a term can be quantified as an inverse function of the number of documents in which it occurs.</p></blockquote><p>For example, the df (document frequency) and idf for some words in Shakespeare's 37 plays are as follows:<sup id="cite_ref-4" class="reference"><a href="#cite_note-4"><span class="cite-bracket">[</span>4<span class="cite-bracket">]</span></a></sup> </p><table class="wikitable"> <caption> </caption> <tbody><tr> <th>Word </th> <th>df </th> <th>idf </th></tr> <tr> <td>Romeo </td> <td>1 </td> <td>1.57 </td></tr> <tr> <td>salad </td> <td>2 </td> <td>1.27 </td></tr> <tr> <td>Falstaff </td> <td>4 </td> <td>0.967 </td></tr> <tr> <td>forest </td> <td>12 </td> <td>0.489 </td></tr> <tr> <td>battle </td> <td>21 </td> <td>0.246 </td></tr> <tr> <td>wit </td> <td>34 </td> <td>0.037 </td></tr> <tr> <td>fool </td> <td>36 </td> <td>0.012 </td></tr> <tr> <td>good </td> <td>37 </td> <td>0 </td></tr> <tr> <td>sweet </td> <td>37 </td> <td>0 </td></tr></tbody></table> <p>We see that "<a href="/wiki/Romeo" title="Romeo">Romeo</a>", "<a href="/wiki/John_Falstaff" title="John Falstaff">Falstaff</a>", and "salad" appear in very few plays, so seeing these words, one could get a good idea as to which play it might be. In contrast, "good" and "sweet" appear in every play and are completely uninformative as to which play it is. </p> <div class="mw-heading mw-heading2"><h2 id="Definition">Definition</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Tf%E2%80%93idf&action=edit&section=2" title="Edit section: Definition"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <ol><li>The tf–idf is the product of two statistics, <i>term frequency</i> and <i>inverse document frequency</i>. 
There are various ways for determining the exact values of both statistics.</li> <li>A formula that aims to define the importance of a keyword or phrase within a document or a web page.</li></ol> <table class="wikitable" style="float: right; margin-left: 1.5em; margin-right: 0; margin-top: 0;"> <caption>Variants of term frequency (tf) weight </caption> <tbody><tr> <th>weighting scheme </th> <th>tf weight </th></tr> <tr> <td>binary</td> <td><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {0,1}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mn>0</mn> <mo>,</mo> <mn>1</mn> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {0,1}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/a019735e07635e5a74673d6e1a34919027e645f5" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:3.359ex; height:2.509ex;" alt="{\displaystyle {0,1}}"></span> </td></tr> <tr> <td>raw count</td> <td><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle f_{t,d}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>f</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>,</mo> <mi>d</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle f_{t,d}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/69faba5875c1ba7d6a3820c813ba22fba35185f5" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.005ex; width:3.282ex; 
height:2.843ex;" alt="{\displaystyle f_{t,d}}"></span> </td></tr> <tr> <td>term frequency </td> <td><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle f_{t,d}{\Bigg /}{\sum _{t'\in d}{f_{t',d}}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>f</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>,</mo> <mi>d</mi> </mrow> </msub> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mo fence="true" stretchy="true" symmetric="true" maxsize="2.470em" minsize="2.470em">/</mo> </mrow> </mrow> <mrow class="MJX-TeXAtom-ORD"> <munder> <mo>∑<!-- ∑ --></mo> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi>t</mi> <mo>′</mo> </msup> <mo>∈<!-- ∈ --></mo> <mi>d</mi> </mrow> </munder> <mrow class="MJX-TeXAtom-ORD"> <msub> <mi>f</mi> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi>t</mi> <mo>′</mo> </msup> <mo>,</mo> <mi>d</mi> </mrow> </msub> </mrow> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle f_{t,d}{\Bigg /}{\sum _{t'\in d}{f_{t',d}}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/91699003abf4fe8bdf861bbce08e73e71acf5fd4" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -3.338ex; width:13.808ex; height:7.676ex;" alt="{\displaystyle f_{t,d}{\Bigg /}{\sum _{t'\in d}{f_{t',d}}}}"></span> </td></tr> <tr> <td>log normalization</td> <td><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \log(1+f_{t,d})}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>log</mi> <mo>⁡<!-- --></mo> <mo stretchy="false">(</mo> <mn>1</mn> <mo>+</mo> <msub> <mi>f</mi> <mrow class="MJX-TeXAtom-ORD"> 
<mi>t</mi> <mo>,</mo> <mi>d</mi> </mrow> </msub> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \log(1+f_{t,d})}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/9c173382612c58c00325c4e9f593739ab3afc324" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.005ex; width:12.066ex; height:3.009ex;" alt="{\displaystyle \log(1+f_{t,d})}"></span> </td></tr> <tr> <td>double normalization 0.5</td> <td><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle 0.5+0.5\cdot {\frac {f_{t,d}}{\max _{\{t'\in d\}}{f_{t',d}}}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mn>0.5</mn> <mo>+</mo> <mn>0.5</mn> <mo>⋅<!-- ⋅ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <msub> <mi>f</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>,</mo> <mi>d</mi> </mrow> </msub> <mrow> <munder> <mo movablelimits="true" form="prefix">max</mo> <mrow class="MJX-TeXAtom-ORD"> <mo fence="false" stretchy="false">{</mo> <msup> <mi>t</mi> <mo>′</mo> </msup> <mo>∈<!-- ∈ --></mo> <mi>d</mi> <mo fence="false" stretchy="false">}</mo> </mrow> </munder> <mrow class="MJX-TeXAtom-ORD"> <msub> <mi>f</mi> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi>t</mi> <mo>′</mo> </msup> <mo>,</mo> <mi>d</mi> </mrow> </msub> </mrow> </mrow> </mfrac> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle 0.5+0.5\cdot {\frac {f_{t,d}}{\max _{\{t'\in d\}}{f_{t',d}}}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/45badc1c70ec2caa00ed8c21ed75bd9f8d3e650c" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.838ex; width:24.783ex; height:6.676ex;" alt="{\displaystyle 
0.5+0.5\cdot {\frac {f_{t,d}}{\max _{\{t'\in d\}}{f_{t',d}}}}}"></span> </td></tr> <tr> <td>double normalization K</td> <td><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle K+(1-K){\frac {f_{t,d}}{\max _{\{t'\in d\}}{f_{t',d}}}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>K</mi> <mo>+</mo> <mo stretchy="false">(</mo> <mn>1</mn> <mo>−<!-- − --></mo> <mi>K</mi> <mo stretchy="false">)</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <msub> <mi>f</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>,</mo> <mi>d</mi> </mrow> </msub> <mrow> <munder> <mo movablelimits="true" form="prefix">max</mo> <mrow class="MJX-TeXAtom-ORD"> <mo fence="false" stretchy="false">{</mo> <msup> <mi>t</mi> <mo>′</mo> </msup> <mo>∈<!-- ∈ --></mo> <mi>d</mi> <mo fence="false" stretchy="false">}</mo> </mrow> </munder> <mrow class="MJX-TeXAtom-ORD"> <msub> <mi>f</mi> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi>t</mi> <mo>′</mo> </msup> <mo>,</mo> <mi>d</mi> </mrow> </msub> </mrow> </mrow> </mfrac> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle K+(1-K){\frac {f_{t,d}}{\max _{\{t'\in d\}}{f_{t',d}}}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/65b776d7a3f8e42f15c880fb7582282b987684fe" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.838ex; width:27.105ex; height:6.676ex;" alt="{\displaystyle K+(1-K){\frac {f_{t,d}}{\max _{\{t'\in d\}}{f_{t',d}}}}}"></span> </td></tr></tbody></table> <div class="mw-heading mw-heading3"><h3 id="Term_frequency">Term frequency</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Tf%E2%80%93idf&action=edit&section=3" title="Edit section: Term frequency"><span>edit</span></a><span 
class="mw-editsection-bracket">]</span></span></div> <p>Term frequency, <span class="texhtml">tf(<i>t</i>,<i>d</i>)</span>, is the relative frequency of term <span class="texhtml"><i>t</i></span> within document <span class="texhtml"><i>d</i></span>, </p> <dl><dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \mathrm {tf} (t,d)={\frac {f_{t,d}}{\sum _{t'\in d}{f_{t',d}}}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">t</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mi>t</mi> <mo>,</mo> <mi>d</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <msub> <mi>f</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>,</mo> <mi>d</mi> </mrow> </msub> <mrow> <munder> <mo>∑<!-- ∑ --></mo> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi>t</mi> <mo>′</mo> </msup> <mo>∈<!-- ∈ --></mo> <mi>d</mi> </mrow> </munder> <mrow class="MJX-TeXAtom-ORD"> <msub> <mi>f</mi> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi>t</mi> <mo>′</mo> </msup> <mo>,</mo> <mi>d</mi> </mrow> </msub> </mrow> </mrow> </mfrac> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \mathrm {tf} (t,d)={\frac {f_{t,d}}{\sum _{t'\in d}{f_{t',d}}}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/dd4f8a91dd0d28a11c00c94a13a315a5b49a8070" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.838ex; width:20.571ex; height:6.676ex;" alt="{\displaystyle \mathrm {tf} (t,d)={\frac {f_{t,d}}{\sum _{t'\in d}{f_{t',d}}}}}"></span>,</dd></dl> <p>where <span class="texhtml"><i>f</i><sub><i>t</i>,<i>d</i></sub></span> is the <i>raw count</i> of a term in a document, i.e., the number of times that term <span 
class="texhtml mvar" style="font-style:italic;">t</span> occurs in document <span class="texhtml mvar" style="font-style:italic;">d</span>. Note the denominator is simply the total number of terms in document <span class="texhtml"><i>d</i></span> (counting each occurrence of the same term separately). There are various other ways to define term frequency:<sup id="cite_ref-5" class="reference"><a href="#cite_note-5"><span class="cite-bracket">[</span>5<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page / location: 128">: 128 </span></sup> </p> <ul><li>the raw count itself: <span class="texhtml">tf(<i>t</i>,<i>d</i>) = <i>f</i><sub><i>t</i>,<i>d</i></sub></span></li> <li><a href="/wiki/Boolean_data_type" title="Boolean data type">Boolean</a> "frequencies": <span class="texhtml">tf(<i>t</i>,<i>d</i>) = 1</span> if <span class="texhtml mvar" style="font-style:italic;">t</span> occurs in <span class="texhtml mvar" style="font-style:italic;">d</span> and 0 otherwise;</li> <li><a href="/wiki/Logarithmic_scale" title="Logarithmic scale">logarithmically scaled</a> frequency: <span class="texhtml">tf(<i>t</i>,<i>d</i>) = log (1 + <i>f</i><sub><i>t</i>,<i>d</i></sub>)</span>;<sup id="cite_ref-6" class="reference"><a href="#cite_note-6"><span class="cite-bracket">[</span>6<span class="cite-bracket">]</span></a></sup></li> <li>augmented frequency, to prevent a bias towards longer documents, e.g. 
raw frequency divided by the raw frequency of the most frequently occurring term in the document:</li></ul> <dl><dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \mathrm {tf} (t,d)=0.5+0.5\cdot {\frac {f_{t,d}}{\max\{f_{t',d}:t'\in d\}}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">t</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mi>t</mi> <mo>,</mo> <mi>d</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mn>0.5</mn> <mo>+</mo> <mn>0.5</mn> <mo>⋅<!-- ⋅ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <msub> <mi>f</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>,</mo> <mi>d</mi> </mrow> </msub> <mrow> <mo movablelimits="true" form="prefix">max</mo> <mo fence="false" stretchy="false">{</mo> <msub> <mi>f</mi> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi>t</mi> <mo>′</mo> </msup> <mo>,</mo> <mi>d</mi> </mrow> </msub> <mo>:</mo> <msup> <mi>t</mi> <mo>′</mo> </msup> <mo>∈<!-- ∈ --></mo> <mi>d</mi> <mo fence="false" stretchy="false">}</mo> </mrow> </mfrac> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \mathrm {tf} (t,d)=0.5+0.5\cdot {\frac {f_{t,d}}{\max\{f_{t',d}:t'\in d\}}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/da4be29a89f4c67ff5a8ad0c7355df1aff67a65b" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.838ex; width:39.049ex; height:6.676ex;" alt="{\displaystyle \mathrm {tf} (t,d)=0.5+0.5\cdot {\frac {f_{t,d}}{\max\{f_{t',d}:t'\in d\}}}}"></span></dd></dl> <div style="clear:right;" class=""></div> <div class="mw-heading mw-heading3"><h3 id="Inverse_document_frequency">Inverse document frequency</h3><span class="mw-editsection"><span 
class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Tf%E2%80%93idf&action=edit&section=4" title="Edit section: Inverse document frequency"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <table class="wikitable" style="float: right; margin-left: 1.5em; margin-right: 0; margin-top: 0;"> <caption>Variants of inverse document frequency (idf) weight </caption> <tbody><tr> <th>weighting scheme </th> <th>idf weight (<span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle n_{t}=|\{d\in D:t\in d\}|}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>n</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mo fence="false" stretchy="false">{</mo> <mi>d</mi> <mo>∈<!-- ∈ --></mo> <mi>D</mi> <mo>:</mo> <mi>t</mi> <mo>∈<!-- ∈ --></mo> <mi>d</mi> <mo fence="false" stretchy="false">}</mo> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle n_{t}=|\{d\in D:t\in d\}|}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/e5d6fca54b5ec3fa4680ead41736fc1c16a58ca1" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:21.752ex; height:2.843ex;" alt="{\displaystyle n_{t}=|\{d\in D:t\in d\}|}"></span>) </th></tr> <tr> <td>unary</td> <td>1 </td></tr> <tr> <td>inverse document frequency</td> <td><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \log {\frac {N}{n_{t}}}=-\log {\frac {n_{t}}{N}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle 
displaystyle="true" scriptlevel="0"> <mi>log</mi> <mo>⁡<!-- --></mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mi>N</mi> <msub> <mi>n</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> </mfrac> </mrow> <mo>=</mo> <mo>−<!-- − --></mo> <mi>log</mi> <mo>⁡<!-- --></mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <msub> <mi>n</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mi>N</mi> </mfrac> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \log {\frac {N}{n_{t}}}=-\log {\frac {n_{t}}{N}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/864fcfdc0c16344c11509f724f1aa7081cf9f657" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.171ex; width:18.125ex; height:5.509ex;" alt="{\displaystyle \log {\frac {N}{n_{t}}}=-\log {\frac {n_{t}}{N}}}"></span> </td></tr> <tr> <td>inverse document frequency smooth</td> <td><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \log \left({\frac {N}{1+n_{t}}}\right)+1}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>log</mi> <mo>⁡<!-- --></mo> <mrow> <mo>(</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mi>N</mi> <mrow> <mn>1</mn> <mo>+</mo> <msub> <mi>n</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> </mrow> </mfrac> </mrow> <mo>)</mo> </mrow> <mo>+</mo> <mn>1</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \log \left({\frac {N}{1+n_{t}}}\right)+1}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/25f4d6690acaaef1f15f308d24f6f8a439de971d" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.505ex; width:17.455ex; height:6.176ex;" alt="{\displaystyle 
\log \left({\frac {N}{1+n_{t}}}\right)+1}"></span> </td></tr> <tr> <td>inverse document frequency max</td> <td><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \log \left({\frac {\max _{\{t'\in d\}}n_{t'}}{1+n_{t}}}\right)}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>log</mi> <mo>⁡<!-- --></mo> <mrow> <mo>(</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mrow> <munder> <mo movablelimits="true" form="prefix">max</mo> <mrow class="MJX-TeXAtom-ORD"> <mo fence="false" stretchy="false">{</mo> <msup> <mi>t</mi> <mo>′</mo> </msup> <mo>∈<!-- ∈ --></mo> <mi>d</mi> <mo fence="false" stretchy="false">}</mo> </mrow> </munder> <msub> <mi>n</mi> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi>t</mi> <mo>′</mo> </msup> </mrow> </msub> </mrow> <mrow> <mn>1</mn> <mo>+</mo> <msub> <mi>n</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> </mrow> </mfrac> </mrow> <mo>)</mo> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \log \left({\frac {\max _{\{t'\in d\}}n_{t'}}{1+n_{t}}}\right)}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f15c125a1d7f1327afeecc4e2b89272a9a094338" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.505ex; width:19.652ex; height:6.176ex;" alt="{\displaystyle \log \left({\frac {\max _{\{t'\in d\}}n_{t'}}{1+n_{t}}}\right)}"></span> </td></tr> <tr> <td>probabilistic inverse document frequency</td> <td><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \log {\frac {N-n_{t}}{n_{t}}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>log</mi> <mo>⁡<!-- --></mo> 
<mrow class="MJX-TeXAtom-ORD"> <mfrac> <mrow> <mi>N</mi> <mo>−<!-- − --></mo> <msub> <mi>n</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> </mrow> <msub> <mi>n</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> </mfrac> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \log {\frac {N-n_{t}}{n_{t}}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/1868194cba8431aa2d556dd1aac90d78833eaaf3" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.171ex; width:11.32ex; height:5.509ex;" alt="{\displaystyle \log {\frac {N-n_{t}}{n_{t}}}}"></span> </td></tr></tbody></table> <p>The <b>inverse document frequency</b> is a measure of how much information the word provides, i.e., how common or rare it is across all documents. It is the <a href="/wiki/Logarithmic_scale" title="Logarithmic scale">logarithmically scaled</a> inverse fraction of the documents that contain the word (obtained by dividing the total number of documents by the number of documents containing the term, and then taking the logarithm of that quotient): </p> <dl><dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \mathrm {idf} (t,D)=\log {\frac {N}{|\{d:d\in D{\text{ and }}t\in d\}|}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">i</mi> <mi mathvariant="normal">d</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mi>t</mi> <mo>,</mo> <mi>D</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mi>log</mi> <mo>⁡<!-- --></mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mi>N</mi> <mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mo fence="false" stretchy="false">{</mo> <mi>d</mi> 
<mo>:</mo> <mi>d</mi> <mo>∈<!-- ∈ --></mo> <mi>D</mi> <mrow class="MJX-TeXAtom-ORD"> <mtext> and </mtext> </mrow> <mi>t</mi> <mo>∈<!-- ∈ --></mo> <mi>d</mi> <mo fence="false" stretchy="false">}</mo> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> </mrow> </mfrac> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \mathrm {idf} (t,D)=\log {\frac {N}{|\{d:d\in D{\text{ and }}t\in d\}|}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/77c7aafe5a18fe13babd1eece6a5a015c85fc1bb" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.671ex; width:38.262ex; height:6.009ex;" alt="{\displaystyle \mathrm {idf} (t,D)=\log {\frac {N}{|\{d:d\in D{\text{ and }}t\in d\}|}}}"></span></dd></dl> <p>with </p> <ul><li><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle N}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>N</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle N}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f5e3890c981ae85503089652feb48b191b57aae3" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:2.064ex; height:2.176ex;" alt="{\displaystyle N}"></span>: total number of documents in the corpus <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle N={|D|}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>N</mi> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> 
</mrow> <mi>D</mi> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle N={|D|}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/ac5fda2321c816116367dea5a306bdc40727e61e" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:8.38ex; height:2.843ex;" alt="{\displaystyle N={|D|}}"></span></li> <li><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle |\{d\in D:t\in d\}|}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mo fence="false" stretchy="false">{</mo> <mi>d</mi> <mo>∈<!-- ∈ --></mo> <mi>D</mi> <mo>:</mo> <mi>t</mi> <mo>∈<!-- ∈ --></mo> <mi>d</mi> <mo fence="false" stretchy="false">}</mo> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle |\{d\in D:t\in d\}|}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/3ddbc54aba98eaf15ef0f1a0677d3de5c6b4df07" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:16.433ex; height:2.843ex;" alt="{\displaystyle |\{d\in D:t\in d\}|}"></span> : number of documents where the term <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle t}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>t</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle t}</annotation> </semantics> 
</math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/65658b7b223af9e1acc877d848888ecdb4466560" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:0.84ex; height:2.009ex;" alt="{\displaystyle t}"></span> appears (i.e., <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \mathrm {tf} (t,d)\neq 0}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">t</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mi>t</mi> <mo>,</mo> <mi>d</mi> <mo stretchy="false">)</mo> <mo>≠<!-- ≠ --></mo> <mn>0</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \mathrm {tf} (t,d)\neq 0}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/0b53fe00b6a3d033f183a8f3f0cfae9982206d5a" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:10.929ex; height:2.843ex;" alt="{\displaystyle \mathrm {tf} (t,d)\neq 0}"></span>). If the term is not in the corpus, this will lead to a division-by-zero. 
It is therefore common to adjust the numerator to <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle 1+N}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mn>1</mn> <mo>+</mo> <mi>N</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle 1+N}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/778c966eef93a41f8d9c16261fbbb03785cb36ab" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.505ex; width:6.066ex; height:2.343ex;" alt="{\displaystyle 1+N}"></span> and the denominator to <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle 1+|\{d\in D:t\in d\}|}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mn>1</mn> <mo>+</mo> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mo fence="false" stretchy="false">{</mo> <mi>d</mi> <mo>∈<!-- ∈ --></mo> <mi>D</mi> <mo>:</mo> <mi>t</mi> <mo>∈<!-- ∈ --></mo> <mi>d</mi> <mo fence="false" stretchy="false">}</mo> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle 1+|\{d\in D:t\in d\}|}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/104921c6994a66054acc909a6bb811a47f983a53" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:20.436ex; height:2.843ex;" alt="{\displaystyle 1+|\{d\in D:t\in d\}|}"></span>.</li></ul> <figure class="mw-default-size" typeof="mw:File/Thumb"><a href="/wiki/File:Plot_IDF_functions.png" 
class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/0/05/Plot_IDF_functions.png/220px-Plot_IDF_functions.png" decoding="async" width="220" height="124" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/0/05/Plot_IDF_functions.png/330px-Plot_IDF_functions.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/0/05/Plot_IDF_functions.png/440px-Plot_IDF_functions.png 2x" data-file-width="2647" data-file-height="1486" /></a><figcaption>Plot of different inverse document frequency functions: standard, smooth, probabilistic.</figcaption></figure> <div style="clear:right;" class=""></div> <div class="mw-heading mw-heading3"><h3 id="Term_frequency–inverse_document_frequency"><span id="Term_frequency.E2.80.93inverse_document_frequency"></span>Term frequency–inverse document frequency</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Tf%E2%80%93idf&action=edit&section=5" title="Edit section: Term frequency–inverse document frequency"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <table class="wikitable" style="float: right; margin-left: 1.5em; margin-right: 0; margin-top: 0;"> <caption>Variants of term frequency-inverse document frequency (tf–idf) weights </caption> <tbody><tr> <th>weighting scheme </th> <th>tf-idf </th></tr> <tr> <td>count-idf </td> <td><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle f_{t,d}\cdot \log {\frac {N}{n_{t}}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>f</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>,</mo> <mi>d</mi> </mrow> </msub> <mo>⋅<!-- ⋅ --></mo> <mi>log</mi> <mo>⁡<!-- --></mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mi>N</mi> <msub> <mi>n</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> 
</mrow> </msub> </mfrac> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle f_{t,d}\cdot \log {\frac {N}{n_{t}}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/74db3d91105b74e8450a78fadbf7ea7f241bc737" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.171ex; width:11.377ex; height:5.509ex;" alt="{\displaystyle f_{t,d}\cdot \log {\frac {N}{n_{t}}}}"></span> </td></tr> <tr> <td>double normalization-idf </td> <td><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \left(0.5+0.5{\frac {f_{t,q}}{\max _{t}f_{t,q}}}\right)\cdot \log {\frac {N}{n_{t}}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow> <mo>(</mo> <mrow> <mn>0.5</mn> <mo>+</mo> <mn>0.5</mn> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <msub> <mi>f</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>,</mo> <mi>q</mi> </mrow> </msub> <mrow> <munder> <mo movablelimits="true" form="prefix">max</mo> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </munder> <msub> <mi>f</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>,</mo> <mi>q</mi> </mrow> </msub> </mrow> </mfrac> </mrow> </mrow> <mo>)</mo> </mrow> <mo>⋅<!-- ⋅ --></mo> <mi>log</mi> <mo>⁡<!-- --></mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mi>N</mi> <msub> <mi>n</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> </mfrac> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \left(0.5+0.5{\frac {f_{t,q}}{\max _{t}f_{t,q}}}\right)\cdot \log {\frac {N}{n_{t}}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/fe1629098093a97eb293bd5085009c51c3cdd77b" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" 
style="vertical-align: -2.671ex; width:29.854ex; height:6.509ex;" alt="{\displaystyle \left(0.5+0.5{\frac {f_{t,q}}{\max _{t}f_{t,q}}}\right)\cdot \log {\frac {N}{n_{t}}}}"></span> </td></tr> <tr> <td>log normalization-idf </td> <td><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle (1+\log f_{t,d})\cdot \log {\frac {N}{n_{t}}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mo stretchy="false">(</mo> <mn>1</mn> <mo>+</mo> <mi>log</mi> <mo>⁡<!-- --></mo> <msub> <mi>f</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>,</mo> <mi>d</mi> </mrow> </msub> <mo stretchy="false">)</mo> <mo>⋅<!-- ⋅ --></mo> <mi>log</mi> <mo>⁡<!-- --></mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mi>N</mi> <msub> <mi>n</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> </mfrac> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle (1+\log f_{t,d})\cdot \log {\frac {N}{n_{t}}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/c271425fa07254a10379f1379e02d0fd034f4f21" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.171ex; width:20.548ex; height:5.509ex;" alt="{\displaystyle (1+\log f_{t,d})\cdot \log {\frac {N}{n_{t}}}}"></span> </td></tr></tbody></table> <p>Then tf–idf is calculated as </p> <dl><dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \mathrm {tfidf} (t,d,D)=\mathrm {tf} (t,d)\cdot \mathrm {idf} (t,D)}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">t</mi> <mi mathvariant="normal">f</mi> <mi mathvariant="normal">i</mi> <mi 
mathvariant="normal">d</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mi>t</mi> <mo>,</mo> <mi>d</mi> <mo>,</mo> <mi>D</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">t</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mi>t</mi> <mo>,</mo> <mi>d</mi> <mo stretchy="false">)</mo> <mo>⋅<!-- ⋅ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">i</mi> <mi mathvariant="normal">d</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mi>t</mi> <mo>,</mo> <mi>D</mi> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \mathrm {tfidf} (t,d,D)=\mathrm {tf} (t,d)\cdot \mathrm {idf} (t,D)}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/10109d0e60cc9d50a1ea2f189bac0ac29a030a00" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:32.289ex; height:2.843ex;" alt="{\displaystyle \mathrm {tfidf} (t,d,D)=\mathrm {tf} (t,d)\cdot \mathrm {idf} (t,D)}"></span></dd></dl> <p>A high weight in tf–idf is reached by a high term <a href="/wiki/Frequency_(statistics)" title="Frequency (statistics)">frequency</a> (in the given document) and a low document frequency of the term in the whole collection of documents; the weights hence tend to filter out common terms. Since the ratio inside the idf's log function is always greater than or equal to 1, the value of idf (and tf–idf) is greater than or equal to 0. As a term appears in more documents, the ratio inside the logarithm approaches 1, bringing the idf and tf–idf closer to 0. 
</p> <div style="clear:right;" class=""></div> <div class="mw-heading mw-heading2"><h2 id="Justification_of_idf">Justification of idf</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Tf%E2%80%93idf&action=edit&section=6" title="Edit section: Justification of idf"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Idf was introduced as "term specificity" by <a href="/wiki/Karen_Sp%C3%A4rck_Jones" title="Karen Spärck Jones">Karen Spärck Jones</a> in a 1972 paper. Although it has worked well as a <a href="/wiki/Heuristic" title="Heuristic">heuristic</a>, its theoretical foundations have been troublesome for at least three decades afterward, with many researchers trying to find <a href="/wiki/Information_theory" title="Information theory">information theoretic</a> justifications for it.<sup id="cite_ref-understanding_7-0" class="reference"><a href="#cite_note-understanding-7"><span class="cite-bracket">[</span>7<span class="cite-bracket">]</span></a></sup> </p><p>Spärck Jones's own explanation did not propose much theory, aside from a connection to <a href="/wiki/Zipf%27s_law" title="Zipf's law">Zipf's law</a>.<sup id="cite_ref-understanding_7-1" class="reference"><a href="#cite_note-understanding-7"><span class="cite-bracket">[</span>7<span class="cite-bracket">]</span></a></sup> Attempts have been made to put idf on a <a href="/wiki/Probability_theory" title="Probability theory">probabilistic</a> footing,<sup id="cite_ref-8" class="reference"><a href="#cite_note-8"><span class="cite-bracket">[</span>8<span class="cite-bracket">]</span></a></sup> by estimating the probability that a given document <span class="texhtml mvar" style="font-style:italic;">d</span> contains a term <span class="texhtml mvar" style="font-style:italic;">t</span> as the relative document frequency, </p> <dl><dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" 
style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle P(t|D)={\frac {|\{d\in D:t\in d\}|}{N}},}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>P</mi> <mo stretchy="false">(</mo> <mi>t</mi> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mi>D</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mo fence="false" stretchy="false">{</mo> <mi>d</mi> <mo>∈<!-- ∈ --></mo> <mi>D</mi> <mo>:</mo> <mi>t</mi> <mo>∈<!-- ∈ --></mo> <mi>d</mi> <mo fence="false" stretchy="false">}</mo> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> </mrow> <mi>N</mi> </mfrac> </mrow> <mo>,</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle P(t|D)={\frac {|\{d\in D:t\in d\}|}{N}},}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/c9bd4ecbef91389190266ae2724f3c3884c81e80" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.838ex; width:27.98ex; height:5.676ex;" alt="{\displaystyle P(t|D)={\frac {|\{d\in D:t\in d\}|}{N}},}"></span></dd></dl> <p>so that we can define idf as </p> <dl><dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\begin{aligned}\mathrm {idf} &=-\log P(t|D)\\&=\log {\frac {1}{P(t|D)}}\\&=\log {\frac {N}{|\{d\in D:t\in d\}|}}\end{aligned}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"> <mtr> <mtd> <mrow 
class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">i</mi> <mi mathvariant="normal">d</mi> <mi mathvariant="normal">f</mi> </mrow> </mtd> <mtd> <mi></mi> <mo>=</mo> <mo>−<!-- − --></mo> <mi>log</mi> <mo>⁡<!-- --></mo> <mi>P</mi> <mo stretchy="false">(</mo> <mi>t</mi> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mi>D</mi> <mo stretchy="false">)</mo> </mtd> </mtr> <mtr> <mtd /> <mtd> <mi></mi> <mo>=</mo> <mi>log</mi> <mo>⁡<!-- --></mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mn>1</mn> <mrow> <mi>P</mi> <mo stretchy="false">(</mo> <mi>t</mi> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mi>D</mi> <mo stretchy="false">)</mo> </mrow> </mfrac> </mrow> </mtd> </mtr> <mtr> <mtd /> <mtd> <mi></mi> <mo>=</mo> <mi>log</mi> <mo>⁡<!-- --></mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mi>N</mi> <mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mo fence="false" stretchy="false">{</mo> <mi>d</mi> <mo>∈<!-- ∈ --></mo> <mi>D</mi> <mo>:</mo> <mi>t</mi> <mo>∈<!-- ∈ --></mo> <mi>d</mi> <mo fence="false" stretchy="false">}</mo> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> </mrow> </mfrac> </mrow> </mtd> </mtr> </mtable> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\begin{aligned}\mathrm {idf} &=-\log P(t|D)\\&=\log {\frac {1}{P(t|D)}}\\&=\log {\frac {N}{|\{d\in D:t\in d\}|}}\end{aligned}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/d15231909526bd3ce99442f0b10ef9793d281d98" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -7.171ex; width:27.282ex; height:15.509ex;" alt="{\displaystyle {\begin{aligned}\mathrm {idf} &=-\log P(t|D)\\&=\log {\frac {1}{P(t|D)}}\\&=\log {\frac {N}{|\{d\in D:t\in d\}|}}\end{aligned}}}"></span></dd></dl> <p>Namely, the inverse document frequency is the logarithm of "inverse" relative document frequency. 
</p><p>This probabilistic interpretation in turn takes the same form as that of <a href="/wiki/Self-information" class="mw-redirect" title="Self-information">self-information</a>. However, applying such information-theoretic notions to problems in information retrieval leads to difficulties when trying to define the appropriate <a href="/wiki/Event_space" class="mw-redirect" title="Event space">event spaces</a> for the required <a href="/wiki/Probability_distribution" title="Probability distribution">probability distributions</a>: not only documents need to be taken into account, but also queries and terms.<sup id="cite_ref-understanding_7-2" class="reference"><a href="#cite_note-understanding-7"><span class="cite-bracket">[</span>7<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading2"><h2 id="Link_with_information_theory">Link with information theory</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Tf%E2%80%93idf&action=edit&section=7" title="Edit section: Link with information theory"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Both term frequency and inverse document frequency can be formulated in terms of <a href="/wiki/Information_theory" title="Information theory">information theory</a>; this helps to explain why their product has a meaning in terms of the joint informational content of a document. 
A characteristic assumption about the distribution <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle p(d,t)}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>p</mi> <mo stretchy="false">(</mo> <mi>d</mi> <mo>,</mo> <mi>t</mi> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle p(d,t)}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/efd0eaa0ccf9a3aace0021b8f26d532783b521e9" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; margin-left: -0.089ex; width:6.158ex; height:2.843ex;" alt="{\displaystyle p(d,t)}"></span> is that: </p> <dl><dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle p(d|t)={\frac {1}{|\{d\in D:t\in d\}|}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>p</mi> <mo stretchy="false">(</mo> <mi>d</mi> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mi>t</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mn>1</mn> <mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mo fence="false" stretchy="false">{</mo> <mi>d</mi> <mo>∈<!-- ∈ --></mo> <mi>D</mi> <mo>:</mo> <mi>t</mi> <mo>∈<!-- ∈ --></mo> <mi>d</mi> <mo fence="false" stretchy="false">}</mo> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> </mrow> </mfrac> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle p(d|t)={\frac {1}{|\{d\in D:t\in d\}|}}}</annotation> </semantics> </math></span><img 
src="https://wikimedia.org/api/rest_v1/media/math/render/svg/ca5bb2e1fdbbf54ac7583c4736cd0db12cfb996b" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.671ex; margin-left: -0.089ex; width:26.138ex; height:6.009ex;" alt="{\displaystyle p(d|t)={\frac {1}{|\{d\in D:t\in d\}|}}}"></span></dd></dl> <p>According to Aizawa, this assumption and its implications "represent the heuristic that tf–idf employs."<sup id="cite_ref-Aizawa_2003_45–65_9-0" class="reference"><a href="#cite_note-Aizawa_2003_45–65-9"><span class="cite-bracket">[</span>9<span class="cite-bracket">]</span></a></sup> </p><p>The <a href="/wiki/Conditional_entropy" title="Conditional entropy">conditional entropy</a> of a "randomly chosen" document in the corpus <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle D}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>D</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle D}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f34a0c600395e5d4345287e21fb26efd386990e6" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.924ex; height:2.176ex;" alt="{\displaystyle D}"></span>, conditional on the fact that it contains a specific term <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle t}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>t</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle t}</annotation> </semantics> </math></span><img 
src="https://wikimedia.org/api/rest_v1/media/math/render/svg/65658b7b223af9e1acc877d848888ecdb4466560" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:0.84ex; height:2.009ex;" alt="{\displaystyle t}"></span> (and assuming that all documents have equal probability to be chosen) is: </p> <dl><dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle H({\cal {D}}|{\cal {T}}=t)=-\sum _{d}p_{d|t}\log p_{d|t}=-\log {\frac {1}{|\{d\in D:t\in d\}|}}=\log {\frac {|\{d\in D:t\in d\}|}{|D|}}+\log |D|=-\mathrm {idf} (t)+\log |D|}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>H</mi> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mi class="MJX-tex-caligraphic" mathvariant="script">D</mi> </mrow> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mi class="MJX-tex-caligraphic" mathvariant="script">T</mi> </mrow> </mrow> <mo>=</mo> <mi>t</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mo>−<!-- − --></mo> <munder> <mo>∑<!-- ∑ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>d</mi> </mrow> </munder> <msub> <mi>p</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>d</mi> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mi>t</mi> </mrow> </msub> <mi>log</mi> <mo>⁡<!-- --></mo> <msub> <mi>p</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>d</mi> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mi>t</mi> </mrow> </msub> <mo>=</mo> <mo>−<!-- − --></mo> <mi>log</mi> <mo>⁡<!-- --></mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mn>1</mn> <mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mo fence="false" stretchy="false">{</mo> <mi>d</mi> <mo>∈<!-- ∈ --></mo> <mi>D</mi> <mo>:</mo> 
<mi>t</mi> <mo>∈<!-- ∈ --></mo> <mi>d</mi> <mo fence="false" stretchy="false">}</mo> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> </mrow> </mfrac> </mrow> <mo>=</mo> <mi>log</mi> <mo>⁡<!-- --></mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mo fence="false" stretchy="false">{</mo> <mi>d</mi> <mo>∈<!-- ∈ --></mo> <mi>D</mi> <mo>:</mo> <mi>t</mi> <mo>∈<!-- ∈ --></mo> <mi>d</mi> <mo fence="false" stretchy="false">}</mo> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> </mrow> <mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mi>D</mi> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> </mrow> </mfrac> </mrow> <mo>+</mo> <mi>log</mi> <mo>⁡<!-- --></mo> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mi>D</mi> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mo>=</mo> <mo>−<!-- − --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">i</mi> <mi mathvariant="normal">d</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mi>t</mi> <mo stretchy="false">)</mo> <mo>+</mo> <mi>log</mi> <mo>⁡<!-- --></mo> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mi>D</mi> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle H({\cal {D}}|{\cal {T}}=t)=-\sum _{d}p_{d|t}\log p_{d|t}=-\log {\frac {1}{|\{d\in D:t\in d\}|}}=\log {\frac {|\{d\in D:t\in d\}|}{|D|}}+\log |D|=-\mathrm {idf} (t)+\log |D|}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/22e87dbf6f9c04a296a720508fab893c7c4656a3" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -3.005ex; width:110.434ex; height:6.843ex;" alt="{\displaystyle H({\cal {D}}|{\cal {T}}=t)=-\sum _{d}p_{d|t}\log 
p_{d|t}=-\log {\frac {1}{|\{d\in D:t\in d\}|}}=\log {\frac {|\{d\in D:t\in d\}|}{|D|}}+\log |D|=-\mathrm {idf} (t)+\log |D|}"></span></dd></dl> <p>In terms of notation, <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\cal {D}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mi class="MJX-tex-caligraphic" mathvariant="script">D</mi> </mrow> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\cal {D}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/3124aa3641d5b81c775608eab28b8508e60d313d" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.792ex; height:2.176ex;" alt="{\displaystyle {\cal {D}}}"></span> and <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\cal {T}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mi class="MJX-tex-caligraphic" mathvariant="script">T</mi> </mrow> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\cal {T}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/9577c444f2a8ba4863cf7363a2790bbefdd88f37" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.936ex; height:2.343ex;" alt="{\displaystyle {\cal {T}}}"></span> are "random variables" corresponding to drawing a document or a term, respectively. 
The <a href="/wiki/Mutual_information" title="Mutual information">mutual information</a> can be expressed as </p> <dl><dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle M({\cal {T}};{\cal {D}})=H({\cal {D}})-H({\cal {D}}|{\cal {T}})=\sum _{t}p_{t}\cdot (H({\cal {D}})-H({\cal {D}}|W=t))=\sum _{t}p_{t}\cdot \mathrm {idf} (t)}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>M</mi> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mi class="MJX-tex-caligraphic" mathvariant="script">T</mi> </mrow> </mrow> <mo>;</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mi class="MJX-tex-caligraphic" mathvariant="script">D</mi> </mrow> </mrow> <mo stretchy="false">)</mo> <mo>=</mo> <mi>H</mi> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mi class="MJX-tex-caligraphic" mathvariant="script">D</mi> </mrow> </mrow> <mo stretchy="false">)</mo> <mo>−<!-- − --></mo> <mi>H</mi> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mi class="MJX-tex-caligraphic" mathvariant="script">D</mi> </mrow> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mi class="MJX-tex-caligraphic" mathvariant="script">T</mi> </mrow> </mrow> <mo stretchy="false">)</mo> <mo>=</mo> <munder> <mo>∑<!-- ∑ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </munder> <msub> <mi>p</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>⋅<!-- ⋅ --></mo> <mo stretchy="false">(</mo> <mi>H</mi> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mi class="MJX-tex-caligraphic" mathvariant="script">D</mi> </mrow> </mrow> <mo stretchy="false">)</mo> <mo>−<!-- − 
--></mo> <mi>H</mi> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mi class="MJX-tex-caligraphic" mathvariant="script">D</mi> </mrow> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mi>W</mi> <mo>=</mo> <mi>t</mi> <mo stretchy="false">)</mo> <mo stretchy="false">)</mo> <mo>=</mo> <munder> <mo>∑<!-- ∑ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </munder> <msub> <mi>p</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> <mo>⋅<!-- ⋅ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">i</mi> <mi mathvariant="normal">d</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mi>t</mi> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle M({\cal {T}};{\cal {D}})=H({\cal {D}})-H({\cal {D}}|{\cal {T}})=\sum _{t}p_{t}\cdot (H({\cal {D}})-H({\cal {D}}|W=t))=\sum _{t}p_{t}\cdot \mathrm {idf} (t)}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/4a247d28c19acb3f6495827b99d9b738b3f3ca9f" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -3.005ex; width:78.347ex; height:5.509ex;" alt="{\displaystyle M({\cal {T}};{\cal {D}})=H({\cal {D}})-H({\cal {D}}|{\cal {T}})=\sum _{t}p_{t}\cdot (H({\cal {D}})-H({\cal {D}}|W=t))=\sum _{t}p_{t}\cdot \mathrm {idf} (t)}"></span></dd></dl> <p>The last step is to expand <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle p_{t}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>p</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle p_{t}}</annotation> </semantics> </math></span><img 
src="https://wikimedia.org/api/rest_v1/media/math/render/svg/269b78ef9fb6e6bf8b70831dd8fcbe830c27ddd9" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; margin-left: -0.089ex; width:2.085ex; height:2.009ex;" alt="{\displaystyle p_{t}}"></span>, the unconditional probability to draw a term, with respect to the (random) choice of a document, to obtain: </p> <dl><dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle M({\cal {T}};{\cal {D}})=\sum _{t,d}p_{t|d}\cdot p_{d}\cdot \mathrm {idf} (t)=\sum _{t,d}\mathrm {tf} (t,d)\cdot {\frac {1}{|D|}}\cdot \mathrm {idf} (t)={\frac {1}{|D|}}\sum _{t,d}\mathrm {tf} (t,d)\cdot \mathrm {idf} (t).}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>M</mi> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mi class="MJX-tex-caligraphic" mathvariant="script">T</mi> </mrow> </mrow> <mo>;</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mi class="MJX-tex-caligraphic" mathvariant="script">D</mi> </mrow> </mrow> <mo stretchy="false">)</mo> <mo>=</mo> <munder> <mo>∑<!-- ∑ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>,</mo> <mi>d</mi> </mrow> </munder> <msub> <mi>p</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mi>d</mi> </mrow> </msub> <mo>⋅<!-- ⋅ --></mo> <msub> <mi>p</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>d</mi> </mrow> </msub> <mo>⋅<!-- ⋅ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">i</mi> <mi mathvariant="normal">d</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mi>t</mi> <mo stretchy="false">)</mo> <mo>=</mo> <munder> <mo>∑<!-- ∑ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>,</mo> <mi>d</mi> </mrow> 
</munder> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">t</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mi>t</mi> <mo>,</mo> <mi>d</mi> <mo stretchy="false">)</mo> <mo>⋅<!-- ⋅ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mn>1</mn> <mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mi>D</mi> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> </mrow> </mfrac> </mrow> <mo>⋅<!-- ⋅ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">i</mi> <mi mathvariant="normal">d</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mi>t</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mn>1</mn> <mrow> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> <mi>D</mi> <mrow class="MJX-TeXAtom-ORD"> <mo stretchy="false">|</mo> </mrow> </mrow> </mfrac> </mrow> <munder> <mo>∑<!-- ∑ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mo>,</mo> <mi>d</mi> </mrow> </munder> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">t</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mi>t</mi> <mo>,</mo> <mi>d</mi> <mo stretchy="false">)</mo> <mo>⋅<!-- ⋅ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">i</mi> <mi mathvariant="normal">d</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mi>t</mi> <mo stretchy="false">)</mo> <mo>.</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle M({\cal {T}};{\cal {D}})=\sum _{t,d}p_{t|d}\cdot p_{d}\cdot \mathrm {idf} (t)=\sum _{t,d}\mathrm {tf} (t,d)\cdot {\frac {1}{|D|}}\cdot \mathrm {idf} (t)={\frac {1}{|D|}}\sum _{t,d}\mathrm {tf} (t,d)\cdot \mathrm {idf} (t).}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/4cfcdec4114484dc9a43059a108529f0a312b083" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: 
-3.338ex; width:82.344ex; height:6.676ex;" alt="{\displaystyle M({\cal {T}};{\cal {D}})=\sum _{t,d}p_{t|d}\cdot p_{d}\cdot \mathrm {idf} (t)=\sum _{t,d}\mathrm {tf} (t,d)\cdot {\frac {1}{|D|}}\cdot \mathrm {idf} (t)={\frac {1}{|D|}}\sum _{t,d}\mathrm {tf} (t,d)\cdot \mathrm {idf} (t).}"></span></dd></dl> <p>This expression shows that summing the tf–idf of all possible terms and documents recovers the mutual information between documents and terms, taking into account all the specificities of their joint distribution.<sup id="cite_ref-Aizawa_2003_45–65_9-1" class="reference"><a href="#cite_note-Aizawa_2003_45–65-9"><span class="cite-bracket">[</span>9<span class="cite-bracket">]</span></a></sup> Each tf–idf hence carries the "bit of information" attached to a term × document pair. </p> <div class="mw-heading mw-heading2"><h2 id="Example_of_tf–idf"><span id="Example_of_tf.E2.80.93idf"></span>Example of tf–idf</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Tf%E2%80%93idf&action=edit&section=8" title="Edit section: Example of tf–idf"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Suppose that we have term count tables of a corpus consisting of only two documents, as listed on the right. 
</p> <table class="wikitable" style="float: right; margin-left: 1.5em; margin-right: 0; margin-top: 0;"> <caption>Document 2 </caption> <tbody><tr> <th>Term </th> <th>Term Count </th></tr> <tr> <td>this</td> <td>1 </td></tr> <tr> <td>is </td> <td>1 </td></tr> <tr> <td>another </td> <td>2 </td></tr> <tr> <td>example </td> <td>3 </td></tr></tbody></table> <table class="wikitable" style="float: right; margin-left: 1.5em; margin-right: 0; margin-top: 0;"> <caption>Document 1 </caption> <tbody><tr> <th>Term </th> <th>Term Count </th></tr> <tr> <td>this</td> <td>1 </td></tr> <tr> <td>is </td> <td>1 </td></tr> <tr> <td>a </td> <td>2 </td></tr> <tr> <td>sample </td> <td>1 </td></tr></tbody></table> <p>The calculation of tf–idf for the term "this" is performed as follows: </p><p>In its raw frequency form, tf is just the frequency of the word "this" in each document. In each document, the word "this" appears once; but as document 2 has more words, its relative frequency is smaller. </p> <dl><dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \mathrm {tf} ({\mathsf {''this''}},d_{1})={\frac {1}{5}}=0.2}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">t</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi></mi> <mo>″</mo> </msup> <mi mathvariant="sans-serif">t</mi> <mi mathvariant="sans-serif">h</mi> <mi mathvariant="sans-serif">i</mi> <msup> <mi mathvariant="sans-serif">s</mi> <mo>″</mo> </msup> </mrow> </mrow> <mo>,</mo> <msub> <mi>d</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mn>1</mn> <mn>5</mn> </mfrac> </mrow> <mo>=</mo> <mn>0.2</mn> </mstyle> </mrow> 
<annotation encoding="application/x-tex">{\displaystyle \mathrm {tf} ({\mathsf {''this''}},d_{1})={\frac {1}{5}}=0.2}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/16c015f44d033993b3e2ede79dac61c98bf96fcb" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.838ex; width:23.806ex; height:5.176ex;" alt="{\displaystyle \mathrm {tf} ({\mathsf {''this''}},d_{1})={\frac {1}{5}}=0.2}"></span></dd> <dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \mathrm {tf} ({\mathsf {''this''}},d_{2})={\frac {1}{7}}\approx 0.14}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">t</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi></mi> <mo>″</mo> </msup> <mi mathvariant="sans-serif">t</mi> <mi mathvariant="sans-serif">h</mi> <mi mathvariant="sans-serif">i</mi> <msup> <mi mathvariant="sans-serif">s</mi> <mo>″</mo> </msup> </mrow> </mrow> <mo>,</mo> <msub> <mi>d</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mn>1</mn> <mn>7</mn> </mfrac> </mrow> <mo>≈<!-- ≈ --></mo> <mn>0.14</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \mathrm {tf} ({\mathsf {''this''}},d_{2})={\frac {1}{7}}\approx 0.14}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/9587323bafed1255880bae568ed8c36222eb9665" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.005ex; width:24.969ex; height:5.343ex;" alt="{\displaystyle \mathrm {tf} ({\mathsf 
{''this''}},d_{2})={\frac {1}{7}}\approx 0.14}"></span></dd></dl> <p>An idf is constant per corpus, and <b>accounts</b> for the ratio of documents that include the word "this". In this case, we have a corpus of two documents and all of them include the word "this". </p> <dl><dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \mathrm {idf} ({\mathsf {''this''}},D)=\log \left({\frac {2}{2}}\right)=0}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">i</mi> <mi mathvariant="normal">d</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi></mi> <mo>″</mo> </msup> <mi mathvariant="sans-serif">t</mi> <mi mathvariant="sans-serif">h</mi> <mi mathvariant="sans-serif">i</mi> <msup> <mi mathvariant="sans-serif">s</mi> <mo>″</mo> </msup> </mrow> </mrow> <mo>,</mo> <mi>D</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mi>log</mi> <mo>⁡<!-- --></mo> <mrow> <mo>(</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mn>2</mn> <mn>2</mn> </mfrac> </mrow> <mo>)</mo> </mrow> <mo>=</mo> <mn>0</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \mathrm {idf} ({\mathsf {''this''}},D)=\log \left({\frac {2}{2}}\right)=0}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/d9cade99407a0e585a9bd7201d0b11056bf86354" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.505ex; width:29.086ex; height:6.176ex;" alt="{\displaystyle \mathrm {idf} ({\mathsf {''this''}},D)=\log \left({\frac {2}{2}}\right)=0}"></span></dd></dl> <p>So tf–idf is zero for the word "this", which implies that the word is not very informative as it appears in all documents. 
</p> <dl><dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \mathrm {tfidf} ({\mathsf {''this''}},d_{1},D)=0.2\times 0=0}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">t</mi> <mi mathvariant="normal">f</mi> <mi mathvariant="normal">i</mi> <mi mathvariant="normal">d</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi></mi> <mo>″</mo> </msup> <mi mathvariant="sans-serif">t</mi> <mi mathvariant="sans-serif">h</mi> <mi mathvariant="sans-serif">i</mi> <msup> <mi mathvariant="sans-serif">s</mi> <mo>″</mo> </msup> </mrow> </mrow> <mo>,</mo> <msub> <mi>d</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo>,</mo> <mi>D</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mn>0.2</mn> <mo>×<!-- × --></mo> <mn>0</mn> <mo>=</mo> <mn>0</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \mathrm {tfidf} ({\mathsf {''this''}},d_{1},D)=0.2\times 0=0}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/b2204ebc15347494e5beb337bb44df070c20528e" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:32.736ex; height:3.009ex;" alt="{\displaystyle \mathrm {tfidf} ({\mathsf {''this''}},d_{1},D)=0.2\times 0=0}"></span></dd> <dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \mathrm {tfidf} ({\mathsf {''this''}},d_{2},D)=0.14\times 0=0}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mi 
mathvariant="normal">t</mi> <mi mathvariant="normal">f</mi> <mi mathvariant="normal">i</mi> <mi mathvariant="normal">d</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi></mi> <mo>″</mo> </msup> <mi mathvariant="sans-serif">t</mi> <mi mathvariant="sans-serif">h</mi> <mi mathvariant="sans-serif">i</mi> <msup> <mi mathvariant="sans-serif">s</mi> <mo>″</mo> </msup> </mrow> </mrow> <mo>,</mo> <msub> <mi>d</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo>,</mo> <mi>D</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mn>0.14</mn> <mo>×<!-- × --></mo> <mn>0</mn> <mo>=</mo> <mn>0</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \mathrm {tfidf} ({\mathsf {''this''}},d_{2},D)=0.14\times 0=0}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/b0534e29ba724bedabc2af3be2064d7e71f3d35c" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:33.898ex; height:3.009ex;" alt="{\displaystyle \mathrm {tfidf} ({\mathsf {''this''}},d_{2},D)=0.14\times 0=0}"></span></dd></dl> <p>The word "example" is more interesting - it occurs three times, but only in the second document: </p> <dl><dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \mathrm {tf} ({\mathsf {''example''}},d_{1})={\frac {0}{5}}=0}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">t</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi></mi> <mo>″</mo> </msup> <mi mathvariant="sans-serif">e</mi> <mi mathvariant="sans-serif">x</mi> <mi 
mathvariant="sans-serif">a</mi> <mi mathvariant="sans-serif">m</mi> <mi mathvariant="sans-serif">p</mi> <mi mathvariant="sans-serif">l</mi> <msup> <mi mathvariant="sans-serif">e</mi> <mo>″</mo> </msup> </mrow> </mrow> <mo>,</mo> <msub> <mi>d</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mn>0</mn> <mn>5</mn> </mfrac> </mrow> <mo>=</mo> <mn>0</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \mathrm {tf} ({\mathsf {''example''}},d_{1})={\frac {0}{5}}=0}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/bbb87d4d366ac55ba7c95d813f044b9f7077b2d3" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.838ex; width:26.367ex; height:5.176ex;" alt="{\displaystyle \mathrm {tf} ({\mathsf {''example''}},d_{1})={\frac {0}{5}}=0}"></span></dd> <dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \mathrm {tf} ({\mathsf {''example''}},d_{2})={\frac {3}{7}}\approx 0.429}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">t</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi></mi> <mo>″</mo> </msup> <mi mathvariant="sans-serif">e</mi> <mi mathvariant="sans-serif">x</mi> <mi mathvariant="sans-serif">a</mi> <mi mathvariant="sans-serif">m</mi> <mi mathvariant="sans-serif">p</mi> <mi mathvariant="sans-serif">l</mi> <msup> <mi mathvariant="sans-serif">e</mi> <mo>″</mo> </msup> </mrow> </mrow> <mo>,</mo> <msub> <mi>d</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo>=</mo> <mrow 
class="MJX-TeXAtom-ORD"> <mfrac> <mn>3</mn> <mn>7</mn> </mfrac> </mrow> <mo>≈<!-- ≈ --></mo> <mn>0.429</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \mathrm {tf} ({\mathsf {''example''}},d_{2})={\frac {3}{7}}\approx 0.429}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/adbf6bc802f75c4281953c7ab7a25cb59bc75068" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.005ex; width:30.501ex; height:5.343ex;" alt="{\displaystyle \mathrm {tf} ({\mathsf {''example''}},d_{2})={\frac {3}{7}}\approx 0.429}"></span></dd> <dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \mathrm {idf} ({\mathsf {''example''}},D)=\log \left({\frac {2}{1}}\right)=0.301}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">i</mi> <mi mathvariant="normal">d</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi></mi> <mo>″</mo> </msup> <mi mathvariant="sans-serif">e</mi> <mi mathvariant="sans-serif">x</mi> <mi mathvariant="sans-serif">a</mi> <mi mathvariant="sans-serif">m</mi> <mi mathvariant="sans-serif">p</mi> <mi mathvariant="sans-serif">l</mi> <msup> <mi mathvariant="sans-serif">e</mi> <mo>″</mo> </msup> </mrow> </mrow> <mo>,</mo> <mi>D</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mi>log</mi> <mo>⁡<!-- --></mo> <mrow> <mo>(</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mn>2</mn> <mn>1</mn> </mfrac> </mrow> <mo>)</mo> </mrow> <mo>=</mo> <mn>0.301</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \mathrm {idf} ({\mathsf {''example''}},D)=\log \left({\frac {2}{1}}\right)=0.301}</annotation> </semantics> 
</math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/2d47d3b7373ec1a75121aebb62c22db147b67db9" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.505ex; width:37.59ex; height:6.176ex;" alt="{\displaystyle \mathrm {idf} ({\mathsf {''example''}},D)=\log \left({\frac {2}{1}}\right)=0.301}"></span></dd></dl> <p>Finally, </p> <dl><dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \mathrm {tfidf} ({\mathsf {''example''}},d_{1},D)=\mathrm {tf} ({\mathsf {''example''}},d_{1})\times \mathrm {idf} ({\mathsf {''example''}},D)=0\times 0.301=0}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">t</mi> <mi mathvariant="normal">f</mi> <mi mathvariant="normal">i</mi> <mi mathvariant="normal">d</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi></mi> <mo>″</mo> </msup> <mi mathvariant="sans-serif">e</mi> <mi mathvariant="sans-serif">x</mi> <mi mathvariant="sans-serif">a</mi> <mi mathvariant="sans-serif">m</mi> <mi mathvariant="sans-serif">p</mi> <mi mathvariant="sans-serif">l</mi> <msup> <mi mathvariant="sans-serif">e</mi> <mo>″</mo> </msup> </mrow> </mrow> <mo>,</mo> <msub> <mi>d</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo>,</mo> <mi>D</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">t</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi></mi> <mo>″</mo> </msup> <mi mathvariant="sans-serif">e</mi> <mi mathvariant="sans-serif">x</mi> <mi mathvariant="sans-serif">a</mi> <mi mathvariant="sans-serif">m</mi> <mi 
mathvariant="sans-serif">p</mi> <mi mathvariant="sans-serif">l</mi> <msup> <mi mathvariant="sans-serif">e</mi> <mo>″</mo> </msup> </mrow> </mrow> <mo>,</mo> <msub> <mi>d</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo>×<!-- × --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">i</mi> <mi mathvariant="normal">d</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi></mi> <mo>″</mo> </msup> <mi mathvariant="sans-serif">e</mi> <mi mathvariant="sans-serif">x</mi> <mi mathvariant="sans-serif">a</mi> <mi mathvariant="sans-serif">m</mi> <mi mathvariant="sans-serif">p</mi> <mi mathvariant="sans-serif">l</mi> <msup> <mi mathvariant="sans-serif">e</mi> <mo>″</mo> </msup> </mrow> </mrow> <mo>,</mo> <mi>D</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mn>0</mn> <mo>×<!-- × --></mo> <mn>0.301</mn> <mo>=</mo> <mn>0</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \mathrm {tfidf} ({\mathsf {''example''}},d_{1},D)=\mathrm {tf} ({\mathsf {''example''}},d_{1})\times \mathrm {idf} ({\mathsf {''example''}},D)=0\times 0.301=0}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/c8bb6b1e5cb00d360f094832cae4cf31d472878b" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:80.083ex; height:3.009ex;" alt="{\displaystyle \mathrm {tfidf} ({\mathsf {''example''}},d_{1},D)=\mathrm {tf} ({\mathsf {''example''}},d_{1})\times \mathrm {idf} ({\mathsf {''example''}},D)=0\times 0.301=0}"></span></dd> <dd><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \mathrm {tfidf} ({\mathsf {''example''}},d_{2},D)=\mathrm {tf} ({\mathsf {''example''}},d_{2})\times \mathrm {idf} 
({\mathsf {''example''}},D)=0.429\times 0.301\approx 0.129}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">t</mi> <mi mathvariant="normal">f</mi> <mi mathvariant="normal">i</mi> <mi mathvariant="normal">d</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi></mi> <mo>″</mo> </msup> <mi mathvariant="sans-serif">e</mi> <mi mathvariant="sans-serif">x</mi> <mi mathvariant="sans-serif">a</mi> <mi mathvariant="sans-serif">m</mi> <mi mathvariant="sans-serif">p</mi> <mi mathvariant="sans-serif">l</mi> <msup> <mi mathvariant="sans-serif">e</mi> <mo>″</mo> </msup> </mrow> </mrow> <mo>,</mo> <msub> <mi>d</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo>,</mo> <mi>D</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">t</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi></mi> <mo>″</mo> </msup> <mi mathvariant="sans-serif">e</mi> <mi mathvariant="sans-serif">x</mi> <mi mathvariant="sans-serif">a</mi> <mi mathvariant="sans-serif">m</mi> <mi mathvariant="sans-serif">p</mi> <mi mathvariant="sans-serif">l</mi> <msup> <mi mathvariant="sans-serif">e</mi> <mo>″</mo> </msup> </mrow> </mrow> <mo>,</mo> <msub> <mi>d</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo>×<!-- × --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">i</mi> <mi mathvariant="normal">d</mi> <mi mathvariant="normal">f</mi> </mrow> <mo stretchy="false">(</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <msup> <mi></mi> <mo>″</mo> </msup> <mi mathvariant="sans-serif">e</mi> <mi mathvariant="sans-serif">x</mi> <mi mathvariant="sans-serif">a</mi> <mi mathvariant="sans-serif">m</mi> <mi 
mathvariant="sans-serif">p</mi> <mi mathvariant="sans-serif">l</mi> <msup> <mi mathvariant="sans-serif">e</mi> <mo>″</mo> </msup> </mrow> </mrow> <mo>,</mo> <mi>D</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mn>0.429</mn> <mo>×<!-- × --></mo> <mn>0.301</mn> <mo>≈<!-- ≈ --></mo> <mn>0.129</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \mathrm {tfidf} ({\mathsf {''example''}},d_{2},D)=\mathrm {tf} ({\mathsf {''example''}},d_{2})\times \mathrm {idf} ({\mathsf {''example''}},D)=0.429\times 0.301\approx 0.129}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/7c4ad75eaca58669eaabf2164793511fe96098a9" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:88.352ex; height:3.009ex;" alt="{\displaystyle \mathrm {tfidf} ({\mathsf {''example''}},d_{2},D)=\mathrm {tf} ({\mathsf {''example''}},d_{2})\times \mathrm {idf} ({\mathsf {''example''}},D)=0.429\times 0.301\approx 0.129}"></span></dd></dl> <p>(using the <a href="/wiki/Base_10_logarithm" class="mw-redirect" title="Base 10 logarithm">base 10 logarithm</a>). </p> <div class="mw-heading mw-heading2"><h2 id="Beyond_terms">Beyond terms</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Tf%E2%80%93idf&action=edit&section=9" title="Edit section: Beyond terms"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>The idea behind tf–idf also applies to entities other than terms. In 1998, the concept of idf was applied to citations.<sup id="cite_ref-10" class="reference"><a href="#cite_note-10"><span class="cite-bracket">[</span>10<span class="cite-bracket">]</span></a></sup> The authors argued that "if a very uncommon citation is shared by two documents, this should be weighted more highly than a citation made by a large number of documents". 
In addition, tf–idf was applied to "visual words" with the purpose of conducting object matching in videos,<sup id="cite_ref-11" class="reference"><a href="#cite_note-11"><span class="cite-bracket">[</span>11<span class="cite-bracket">]</span></a></sup> and to entire sentences.<sup id="cite_ref-12" class="reference"><a href="#cite_note-12"><span class="cite-bracket">[</span>12<span class="cite-bracket">]</span></a></sup> However, the concept of tf–idf did not prove to be more effective in all cases than a plain tf scheme (without idf). When tf–idf was applied to citations, researchers could find no improvement over a simple citation-count weight that had no idf component.<sup id="cite_ref-13" class="reference"><a href="#cite_note-13"><span class="cite-bracket">[</span>13<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading2"><h2 id="Derivatives">Derivatives</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Tf%E2%80%93idf&action=edit&section=10" title="Edit section: Derivatives"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>A number of term-weighting schemes have been derived from tf–idf. One of them is TF–PDF (term frequency * proportional document frequency).<sup id="cite_ref-14" class="reference"><a href="#cite_note-14"><span class="cite-bracket">[</span>14<span class="cite-bracket">]</span></a></sup> TF–PDF was introduced in 2001 in the context of identifying emerging topics in the media. The PDF component measures the difference in how often a term occurs in different domains. Another derivative is TF–IDuF. In TF–IDuF,<sup id="cite_ref-15" class="reference"><a href="#cite_note-15"><span class="cite-bracket">[</span>15<span class="cite-bracket">]</span></a></sup> idf is not calculated based on the document corpus that is to be searched or recommended. Instead, idf is calculated on users' personal document collections. 
The authors report that TF–IDuF was as effective as tf–idf but could also be applied in situations where, e.g., a user modeling system has no access to a global document corpus. </p> <div class="mw-heading mw-heading2"><h2 id="See_also">See also</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Tf%E2%80%93idf&action=edit&section=11" title="Edit section: See also"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <style data-mw-deduplicate="TemplateStyles:r1184024115">.mw-parser-output .div-col{margin-top:0.3em;column-width:30em}.mw-parser-output .div-col-small{font-size:90%}.mw-parser-output .div-col-rules{column-rule:1px solid #aaa}.mw-parser-output .div-col dl,.mw-parser-output .div-col ol,.mw-parser-output .div-col ul{margin-top:0}.mw-parser-output .div-col li,.mw-parser-output .div-col dd{page-break-inside:avoid;break-inside:avoid-column}</style><div class="div-col" style="column-width: 25em;"> <ul><li><a href="/wiki/Word_embedding" title="Word embedding">Word embedding</a></li> <li><a href="/wiki/Kullback%E2%80%93Leibler_divergence" title="Kullback–Leibler divergence">Kullback–Leibler divergence</a></li> <li><a href="/wiki/Latent_Dirichlet_allocation" title="Latent Dirichlet allocation">Latent Dirichlet allocation</a></li> <li><a href="/wiki/Latent_semantic_analysis" title="Latent semantic analysis">Latent semantic analysis</a></li> <li><a href="/wiki/Mutual_information" title="Mutual information">Mutual information</a></li> <li><a href="/wiki/Noun_phrase" title="Noun phrase">Noun phrase</a></li> <li><a href="/wiki/Okapi_BM25" title="Okapi BM25">Okapi BM25</a></li> <li><a href="/wiki/PageRank" title="PageRank">PageRank</a></li> <li><a href="/wiki/Vector_space_model" title="Vector space model">Vector space model</a></li> <li><a href="/wiki/Word_count" title="Word count">Word count</a></li> <li><a href="/wiki/SMART_Information_Retrieval_System" title="SMART Information 
Retrieval System">SMART Information Retrieval System</a></li></ul> </div> <div class="mw-heading mw-heading2"><h2 id="References">References</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Tf%E2%80%93idf&action=edit&section=12" title="Edit section: References"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <style data-mw-deduplicate="TemplateStyles:r1239543626">.mw-parser-output .reflist{margin-bottom:0.5em;list-style-type:decimal}@media screen{.mw-parser-output .reflist{font-size:90%}}.mw-parser-output .reflist .references{font-size:100%;margin-bottom:0;list-style-type:inherit}.mw-parser-output .reflist-columns-2{column-width:30em}.mw-parser-output .reflist-columns-3{column-width:25em}.mw-parser-output .reflist-columns{margin-top:0.3em}.mw-parser-output .reflist-columns ol{margin-top:0}.mw-parser-output .reflist-columns li{page-break-inside:avoid;break-inside:avoid-column}.mw-parser-output .reflist-upper-alpha{list-style-type:upper-alpha}.mw-parser-output .reflist-upper-roman{list-style-type:upper-roman}.mw-parser-output .reflist-lower-alpha{list-style-type:lower-alpha}.mw-parser-output .reflist-lower-greek{list-style-type:lower-greek}.mw-parser-output .reflist-lower-roman{list-style-type:lower-roman}</style><div class="reflist"> <div class="mw-references-wrap mw-references-columns"><ol class="references"> <li id="cite_note-1"><span class="mw-cite-backlink"><b><a href="#cite_ref-1">^</a></b></span> <span class="reference-text"><style data-mw-deduplicate="TemplateStyles:r1238218222">.mw-parser-output cite.citation{font-style:inherit;word-wrap:break-word}.mw-parser-output .citation q{quotes:"\"""\"""'""'"}.mw-parser-output .citation:target{background-color:rgba(0,127,255,0.133)}.mw-parser-output .id-lock-free.id-lock-free a{background:url("//upload.wikimedia.org/wikipedia/commons/6/65/Lock-green.svg")right 0.1em center/9px no-repeat}.mw-parser-output 
.id-lock-limited.id-lock-limited a,.mw-parser-output .id-lock-registration.id-lock-registration a{background:url("//upload.wikimedia.org/wikipedia/commons/d/d6/Lock-gray-alt-2.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-subscription.id-lock-subscription a{background:url("//upload.wikimedia.org/wikipedia/commons/a/aa/Lock-red-alt-2.svg")right 0.1em center/9px no-repeat}.mw-parser-output .cs1-ws-icon a{background:url("//upload.wikimedia.org/wikipedia/commons/4/4c/Wikisource-logo.svg")right 0.1em center/12px no-repeat}body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-free a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-limited a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-registration a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-subscription a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .cs1-ws-icon a{background-size:contain;padding:0 1em 0 0}.mw-parser-output .cs1-code{color:inherit;background:inherit;border:none;padding:inherit}.mw-parser-output .cs1-hidden-error{display:none;color:var(--color-error,#d33)}.mw-parser-output .cs1-visible-error{color:var(--color-error,#d33)}.mw-parser-output .cs1-maint{display:none;color:#085;margin-left:0.3em}.mw-parser-output .cs1-kern-left{padding-left:0.2em}.mw-parser-output .cs1-kern-right{padding-right:0.2em}.mw-parser-output .citation .mw-selflink{font-weight:inherit}@media screen{.mw-parser-output .cs1-format{font-size:95%}html.skin-theme-clientpref-night .mw-parser-output .cs1-maint{color:#18911f}}@media screen and (prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .cs1-maint{color:#18911f}}</style><cite id="CITEREFRajaramanUllman2011" class="citation book cs1">Rajaraman, A.; Ullman, J.D. (2011). <a rel="nofollow" class="external text" href="http://i.stanford.edu/~ullman/mmds/ch1.pdf">"Data Mining"</a> <span class="cs1-format">(PDF)</span>. 
<i>Mining of Massive Datasets</i>. pp. 1–17. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1017%2FCBO9781139058452.002">10.1017/CBO9781139058452.002</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-1-139-05845-2" title="Special:BookSources/978-1-139-05845-2"><bdi>978-1-139-05845-2</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Data+Mining&rft.btitle=Mining+of+Massive+Datasets&rft.pages=1-17&rft.date=2011&rft_id=info%3Adoi%2F10.1017%2FCBO9781139058452.002&rft.isbn=978-1-139-05845-2&rft.aulast=Rajaraman&rft.aufirst=A.&rft.au=Ullman%2C+J.D.&rft_id=http%3A%2F%2Fi.stanford.edu%2F~ullman%2Fmmds%2Fch1.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATf%E2%80%93idf" class="Z3988"></span></span> </li> <li id="cite_note-2"><span class="mw-cite-backlink"><b><a href="#cite_ref-2">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBreitingerGippLanger2015" class="citation journal cs1">Breitinger, Corinna; Gipp, Bela; Langer, Stefan (2015-07-26). <a rel="nofollow" class="external text" href="http://nbn-resolving.de/urn:nbn:de:bsz:352-0-311312">"Research-paper recommender systems: a literature survey"</a>. <i>International Journal on Digital Libraries</i>. <b>17</b> (4): 305–338. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1007%2Fs00799-015-0156-0">10.1007/s00799-015-0156-0</a>. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/1432-5012">1432-5012</a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:207035184">207035184</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=International+Journal+on+Digital+Libraries&rft.atitle=Research-paper+recommender+systems%3A+a+literature+survey&rft.volume=17&rft.issue=4&rft.pages=305-338&rft.date=2015-07-26&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A207035184%23id-name%3DS2CID&rft.issn=1432-5012&rft_id=info%3Adoi%2F10.1007%2Fs00799-015-0156-0&rft.aulast=Breitinger&rft.aufirst=Corinna&rft.au=Gipp%2C+Bela&rft.au=Langer%2C+Stefan&rft_id=http%3A%2F%2Fnbn-resolving.de%2Furn%3Anbn%3Ade%3Absz%3A352-0-311312&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATf%E2%80%93idf" class="Z3988"></span></span> </li> <li id="cite_note-3"><span class="mw-cite-backlink"><b><a href="#cite_ref-3">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSpärck_Jones1972" class="citation journal cs1"><a href="/wiki/Karen_Sp%C3%A4rck_Jones" title="Karen Spärck Jones">Spärck Jones, K.</a> (1972). "A Statistical Interpretation of Term Specificity and Its Application in Retrieval". <i>Journal of Documentation</i>. <b>28</b> (1): 11–21. <a href="/wiki/CiteSeerX_(identifier)" class="mw-redirect" title="CiteSeerX (identifier)">CiteSeerX</a> <span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.115.8343">10.1.1.115.8343</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1108%2Feb026526">10.1108/eb026526</a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:2996187">2996187</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Journal+of+Documentation&rft.atitle=A+Statistical+Interpretation+of+Term+Specificity+and+Its+Application+in+Retrieval&rft.volume=28&rft.issue=1&rft.pages=11-21&rft.date=1972&rft_id=https%3A%2F%2Fciteseerx.ist.psu.edu%2Fviewdoc%2Fsummary%3Fdoi%3D10.1.1.115.8343%23id-name%3DCiteSeerX&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A2996187%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1108%2Feb026526&rft.aulast=Sp%C3%A4rck+Jones&rft.aufirst=K.&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATf%E2%80%93idf" class="Z3988"></span></span> </li> <li id="cite_note-4"><span class="mw-cite-backlink"><b><a href="#cite_ref-4">^</a></b></span> <span class="reference-text">Speech and Language Processing (3rd ed. draft), Dan Jurafsky and James H. Martin, chapter 14.<a rel="nofollow" class="external free" href="https://web.stanford.edu/~jurafsky/slp3/14.pdf">https://web.stanford.edu/~jurafsky/slp3/14.pdf</a></span> </li> <li id="cite_note-5"><span class="mw-cite-backlink"><b><a href="#cite_ref-5">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFManningRaghavanSchutze2008" class="citation book cs1">Manning, C.D.; Raghavan, P.; Schutze, H. (2008). <a rel="nofollow" class="external text" href="http://nlp.stanford.edu/IR-book/pdf/06vect.pdf">"Scoring, term weighting, and the vector space model"</a> <span class="cs1-format">(PDF)</span>. <i>Introduction to Information Retrieval</i>. p. 100. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1017%2FCBO9780511809071.007">10.1017/CBO9780511809071.007</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-0-511-80907-1" title="Special:BookSources/978-0-511-80907-1"><bdi>978-0-511-80907-1</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Scoring%2C+term+weighting%2C+and+the+vector+space+model&rft.btitle=Introduction+to+Information+Retrieval&rft.pages=100&rft.date=2008&rft_id=info%3Adoi%2F10.1017%2FCBO9780511809071.007&rft.isbn=978-0-511-80907-1&rft.aulast=Manning&rft.aufirst=C.D.&rft.au=Raghavan%2C+P.&rft.au=Schutze%2C+H.&rft_id=http%3A%2F%2Fnlp.stanford.edu%2FIR-book%2Fpdf%2F06vect.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATf%E2%80%93idf" class="Z3988"></span></span> </li> <li id="cite_note-6"><span class="mw-cite-backlink"><b><a href="#cite_ref-6">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://jmotif.github.io/sax-vsm_site/morea/algorithm/TFIDF.html">"TFIDF statistics | SAX-VSM"</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=TFIDF+statistics+%7C+SAX-VSM&rft_id=https%3A%2F%2Fjmotif.github.io%2Fsax-vsm_site%2Fmorea%2Falgorithm%2FTFIDF.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATf%E2%80%93idf" class="Z3988"></span></span> </li> <li id="cite_note-understanding-7"><span class="mw-cite-backlink">^ <a href="#cite_ref-understanding_7-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-understanding_7-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-understanding_7-2"><sup><i><b>c</b></i></sup></a></span> <span 
class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFRobertson2004" class="citation journal cs1"><a href="/wiki/Stephen_Robertson_(computer_scientist)" title="Stephen Robertson (computer scientist)">Robertson, S.</a> (2004). "Understanding inverse document frequency: On theoretical arguments for IDF". <i>Journal of Documentation</i>. <b>60</b> (5): 503–520. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1108%2F00220410410560582">10.1108/00220410410560582</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Journal+of+Documentation&rft.atitle=Understanding+inverse+document+frequency%3A+On+theoretical+arguments+for+IDF&rft.volume=60&rft.issue=5&rft.pages=503-520&rft.date=2004&rft_id=info%3Adoi%2F10.1108%2F00220410410560582&rft.aulast=Robertson&rft.aufirst=S.&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATf%E2%80%93idf" class="Z3988"></span></span> </li> <li id="cite_note-8"><span class="mw-cite-backlink"><b><a href="#cite_ref-8">^</a></b></span> <span class="reference-text">See also <a rel="nofollow" class="external text" href="http://nlp.stanford.edu/IR-book/html/htmledition/probability-estimates-in-practice-1.html#p:justificationofidf">Probability estimates in practice</a> in <i>Introduction to Information Retrieval</i>.</span> </li> <li id="cite_note-Aizawa_2003_45–65-9"><span class="mw-cite-backlink">^ <a href="#cite_ref-Aizawa_2003_45–65_9-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Aizawa_2003_45–65_9-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFAizawa2003" class="citation journal cs1">Aizawa, Akiko (2003). "An information-theoretic perspective of tf–idf measures". 
<i>Information Processing and Management</i>. <b>39</b> (1): 45–65. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1016%2FS0306-4573%2802%2900021-3">10.1016/S0306-4573(02)00021-3</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:45793141">45793141</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Information+Processing+and+Management&rft.atitle=An+information-theoretic+perspective+of+tf%E2%80%93idf+measures&rft.volume=39&rft.issue=1&rft.pages=45-65&rft.date=2003&rft_id=info%3Adoi%2F10.1016%2FS0306-4573%2802%2900021-3&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A45793141%23id-name%3DS2CID&rft.aulast=Aizawa&rft.aufirst=Akiko&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATf%E2%80%93idf" class="Z3988"></span></span> </li> <li id="cite_note-10"><span class="mw-cite-backlink"><b><a href="#cite_ref-10">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBollackerLawrenceGiles1998" class="citation book cs1">Bollacker, Kurt D.; Lawrence, Steve; Giles, C. Lee (1998-01-01). "CiteSeer". <i>Proceedings of the second international conference on Autonomous agents - AGENTS '98</i>. pp. 116–123. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1145%2F280765.280786">10.1145/280765.280786</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-0-89791-983-8" title="Special:BookSources/978-0-89791-983-8"><bdi>978-0-89791-983-8</bdi></a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:3526393">3526393</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=CiteSeer&rft.btitle=Proceedings+of+the+second+international+conference+on+Autonomous+agents+-+AGENTS+%2798&rft.pages=116-123&rft.date=1998-01-01&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A3526393%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1145%2F280765.280786&rft.isbn=978-0-89791-983-8&rft.aulast=Bollacker&rft.aufirst=Kurt+D.&rft.au=Lawrence%2C+Steve&rft.au=Giles%2C+C.+Lee&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATf%E2%80%93idf" class="Z3988"></span></span> </li> <li id="cite_note-11"><span class="mw-cite-backlink"><b><a href="#cite_ref-11">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSivicZisserman2003" class="citation book cs1">Sivic, Josef; Zisserman, Andrew (2003-01-01). "Video Google: A text retrieval approach to object matching in videos". <a rel="nofollow" class="external text" href="http://dl.acm.org/citation.cfm?id=946247.946751"><i>Proceedings Ninth IEEE International Conference on Computer Vision</i></a>. ICCV '03. pp. 1470–. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FICCV.2003.1238663">10.1109/ICCV.2003.1238663</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-0-7695-1950-0" title="Special:BookSources/978-0-7695-1950-0"><bdi>978-0-7695-1950-0</bdi></a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:14457153">14457153</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Video+Google%3A+A+text+retrieval+approach+to+object+matching+in+videos&rft.btitle=Proceedings+Ninth+IEEE+International+Conference+on+Computer+Vision&rft.series=ICCV+%2703&rft.pages=1470-&rft.date=2003-01-01&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A14457153%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1109%2FICCV.2003.1238663&rft.isbn=978-0-7695-1950-0&rft.aulast=Sivic&rft.aufirst=Josef&rft.au=Zisserman%2C+Andrew&rft_id=http%3A%2F%2Fdl.acm.org%2Fcitation.cfm%3Fid%3D946247.946751&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATf%E2%80%93idf" class="Z3988"></span></span> </li> <li id="cite_note-12"><span class="mw-cite-backlink"><b><a href="#cite_ref-12">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSeki" class="citation web cs1">Seki, Yohei. <a rel="nofollow" class="external text" href="http://research.nii.ac.jp/ntcir/workshop/OnlineProceedings3/NTCIR3-TSC-SekiY.pdf">"Sentence Extraction by tf/idf and Position Weighting from Newspaper Articles"</a> <span class="cs1-format">(PDF)</span>. 
National Institute of Informatics.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Sentence+Extraction+by+tf%2Fidf+and+Position+Weighting+from+Newspaper+Articles&rft.pub=National+Institute+of+Informatics&rft.aulast=Seki&rft.aufirst=Yohei&rft_id=http%3A%2F%2Fresearch.nii.ac.jp%2Fntcir%2Fworkshop%2FOnlineProceedings3%2FNTCIR3-TSC-SekiY.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATf%E2%80%93idf" class="Z3988"></span></span> </li> <li id="cite_note-13"><span class="mw-cite-backlink"><b><a href="#cite_ref-13">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFBeelBreitinger2017" class="citation journal cs1">Beel, Joeran; Breitinger, Corinna (2017). <a rel="nofollow" class="external text" href="https://web.archive.org/web/20200922150304/http://beel.org/publications/2017%20iConference%20--%20Evaluating%20the%20CC-IDF%20citation-weighting%20scheme%20--%20preprint.pdf">"Evaluating the CC-IDF citation-weighting scheme – How effectively can 'Inverse Document Frequency' (IDF) be applied to references?"</a> <span class="cs1-format">(PDF)</span>. <i>Proceedings of the 12th IConference</i>. Archived from <a rel="nofollow" class="external text" href="http://beel.org/publications/2017%20iConference%20--%20Evaluating%20the%20CC-IDF%20citation-weighting%20scheme%20--%20preprint.pdf">the original</a> <span class="cs1-format">(PDF)</span> on 2020-09-22<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2017-01-29</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Proceedings+of+the+12th+IConference&rft.atitle=Evaluating+the+CC-IDF+citation-weighting+scheme+%E2%80%93+How+effectively+can+%27Inverse+Document+Frequency%27+%28IDF%29+be+applied+to+references%3F&rft.date=2017&rft.aulast=Beel&rft.aufirst=Joeran&rft.au=Breitinger%2C+Corinna&rft_id=http%3A%2F%2Fbeel.org%2Fpublications%2F2017%2520iConference%2520--%2520Evaluating%2520the%2520CC-IDF%2520citation-weighting%2520scheme%2520--%2520preprint.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATf%E2%80%93idf" class="Z3988"></span></span> </li> <li id="cite_note-14"><span class="mw-cite-backlink"><b><a href="#cite_ref-14">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKhoo_Khyou_BunBunIshizuka2001" class="citation book cs1">Khoo Khyou Bun; Bun, Khoo Khyou; Ishizuka, M. (2001). "Emerging Topic Tracking System". <i>Proceedings Third International Workshop on Advanced Issues of E-Commerce and Web-Based Information Systems. WECWIS 2001</i>. pp. 2–11. <a href="/wiki/CiteSeerX_(identifier)" class="mw-redirect" title="CiteSeerX (identifier)">CiteSeerX</a> <span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.16.7986">10.1.1.16.7986</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2Fwecwis.2001.933900">10.1109/wecwis.2001.933900</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-0-7695-1224-2" title="Special:BookSources/978-0-7695-1224-2"><bdi>978-0-7695-1224-2</bdi></a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:1049263">1049263</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Emerging+Topic+Tracking+System&rft.btitle=Proceedings+Third+International+Workshop+on+Advanced+Issues+of+E-Commerce+and+Web-Based+Information+Systems.+WECWIS+2001&rft.pages=2-11&rft.date=2001&rft_id=https%3A%2F%2Fciteseerx.ist.psu.edu%2Fviewdoc%2Fsummary%3Fdoi%3D10.1.1.16.7986%23id-name%3DCiteSeerX&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A1049263%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1109%2Fwecwis.2001.933900&rft.isbn=978-0-7695-1224-2&rft.au=Khoo+Khyou+Bun&rft.au=Bun%2C+Khoo+Khyou&rft.au=Ishizuka%2C+M.&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATf%E2%80%93idf" class="Z3988"></span></span> </li> <li id="cite_note-15"><span class="mw-cite-backlink"><b><a href="#cite_ref-15">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFLangerGipp2017" class="citation journal cs1">Langer, Stefan; Gipp, Bela (2017). <a rel="nofollow" class="external text" href="https://www.gipp.com/wp-content/papercite-data/pdf/beel17.pdf">"TF-IDuF: A Novel Term-Weighting Scheme for User Modeling based on Users' Personal Document Collections"</a> <span class="cs1-format">(PDF)</span>. 
<i>IConference</i>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=IConference&rft.atitle=TF-IDuF%3A+A+Novel+Term-Weighting+Scheme+for+User+Modeling+based+on+Users%27+Personal+Document+Collections&rft.date=2017&rft.aulast=Langer&rft.aufirst=Stefan&rft.au=Gipp%2C+Bela&rft_id=https%3A%2F%2Fwww.gipp.com%2Fwp-content%2Fpapercite-data%2Fpdf%2Fbeel17.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATf%E2%80%93idf" class="Z3988"></span></span> </li> </ol></div></div> <ul><li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSaltonMcGill1986" class="citation book cs1"><a href="/wiki/Gerard_Salton" title="Gerard Salton">Salton, G</a>; McGill, M. J. (1986). <span class="id-lock-registration" title="Free registration required"><a rel="nofollow" class="external text" href="https://archive.org/details/introductiontomo00salt"><i>Introduction to modern information retrieval</i></a></span>. <a href="/wiki/McGraw-Hill" class="mw-redirect" title="McGraw-Hill">McGraw-Hill</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-0-07-054484-0" title="Special:BookSources/978-0-07-054484-0"><bdi>978-0-07-054484-0</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Introduction+to+modern+information+retrieval&rft.pub=McGraw-Hill&rft.date=1986&rft.isbn=978-0-07-054484-0&rft.aulast=Salton&rft.aufirst=G&rft.au=McGill%2C+M.+J.&rft_id=https%3A%2F%2Farchive.org%2Fdetails%2Fintroductiontomo00salt&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATf%E2%80%93idf" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSaltonFoxWu1983" class="citation journal cs1"><a href="/wiki/Gerard_Salton" title="Gerard Salton">Salton, G.</a>; Fox, E. A.; Wu, H. (1983). 
"Extended Boolean information retrieval". <i>Communications of the ACM</i>. <b>26</b> (11): 1022–1036. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1145%2F182.358466">10.1145/182.358466</a>. <a href="/wiki/Hdl_(identifier)" class="mw-redirect" title="Hdl (identifier)">hdl</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://hdl.handle.net/1813%2F6351">1813/6351</a></span>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:207180535">207180535</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Communications+of+the+ACM&rft.atitle=Extended+Boolean+information+retrieval&rft.volume=26&rft.issue=11&rft.pages=1022-1036&rft.date=1983&rft_id=info%3Ahdl%2F1813%2F6351&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A207180535%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1145%2F182.358466&rft.aulast=Salton&rft.aufirst=G.&rft.au=Fox%2C+E.+A.&rft.au=Wu%2C+H.&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATf%E2%80%93idf" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFSaltonBuckley1988" class="citation journal cs1"><a href="/wiki/Gerard_Salton" title="Gerard Salton">Salton, G.</a>; Buckley, C. (1988). <a rel="nofollow" class="external text" href="https://ecommons.cornell.edu/bitstream/1813/6721/1/87-881.pdf">"Term-weighting approaches in automatic text retrieval"</a> <span class="cs1-format">(PDF)</span>. <i>Information Processing & Management</i>. <b>24</b> (5): 513–523. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1016%2F0306-4573%2888%2990021-0">10.1016/0306-4573(88)90021-0</a>. <a href="/wiki/Hdl_(identifier)" class="mw-redirect" title="Hdl (identifier)">hdl</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://hdl.handle.net/1813%2F6721">1813/6721</a></span>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:7725217">7725217</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Information+Processing+%26+Management&rft.atitle=Term-weighting+approaches+in+automatic+text+retrieval&rft.volume=24&rft.issue=5&rft.pages=513-523&rft.date=1988&rft_id=info%3Ahdl%2F1813%2F6721&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A7725217%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1016%2F0306-4573%2888%2990021-0&rft.aulast=Salton&rft.aufirst=G.&rft.au=Buckley%2C+C.&rft_id=https%3A%2F%2Fecommons.cornell.edu%2Fbitstream%2F1813%2F6721%2F1%2F87-881.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATf%E2%80%93idf" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFWuLukWongKwok2008" class="citation journal cs1">Wu, H. C.; Luk, R.W.P.; Wong, K.F.; Kwok, K.L. (2008). "Interpreting TF-IDF term weights as making relevance decisions". <i>ACM Transactions on Information Systems</i>. <b>26</b> (3): 1. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1145%2F1361684.1361686">10.1145/1361684.1361686</a>. 
<a href="/wiki/Hdl_(identifier)" class="mw-redirect" title="Hdl (identifier)">hdl</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://hdl.handle.net/10397%2F10130">10397/10130</a></span>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:18303048">18303048</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=ACM+Transactions+on+Information+Systems&rft.atitle=Interpreting+TF-IDF+term+weights+as+making+relevance+decisions&rft.volume=26&rft.issue=3&rft.pages=1&rft.date=2008&rft_id=info%3Ahdl%2F10397%2F10130&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A18303048%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1145%2F1361684.1361686&rft.aulast=Wu&rft.aufirst=H.+C.&rft.au=Luk%2C+R.W.P.&rft.au=Wong%2C+K.F.&rft.au=Kwok%2C+K.L.&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATf%E2%80%93idf" class="Z3988"></span></li></ul> <div class="mw-heading mw-heading2"><h2 id="External_links_and_suggested_reading">External links and suggested reading</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Tf%E2%80%93idf&action=edit&section=13" title="Edit section: External links and suggested reading"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <ul><li><a href="/wiki/Gensim" title="Gensim">Gensim</a> is a Python library for vector space modeling and includes tf–idf weighting.</li> <li><a rel="nofollow" class="external text" href="http://www.codeproject.com/KB/IP/AnatomyOfASearchEngine1.aspx">Anatomy of a search engine</a></li> <li><a rel="nofollow" class="external text" href="https://lucene.apache.org/core/3_6_1/api/all/org/apache/lucene/search/Similarity.html">tf–idf and related definitions</a> as used in <a href="/wiki/Lucene" class="mw-redirect" 
title="Lucene">Lucene</a></li> <li><a rel="nofollow" class="external text" href="http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer">TfidfTransformer</a> in <a href="/wiki/Scikit-learn" title="Scikit-learn">scikit-learn</a></li> <li><a rel="nofollow" class="external text" href="https://www.hpclab.ceid.upatras.gr/tmg/">Text to Matrix Generator (TMG)</a>, a MATLAB toolbox that can be used for various tasks in text mining (TM), specifically: i) indexing, ii) retrieval, iii) dimensionality reduction, iv) clustering, and v) classification. The indexing step offers the user the ability to apply local and global weighting methods, including tf–idf.</li> <li><a rel="nofollow" class="external text" href="https://www.opinosis-analytics.com/knowledge-base/term-frequency-explained/">Term-frequency explained</a> – an explanation of term frequency</li></ul> <!-- NewPP limit report Parsed by mw‐web.codfw.main‐f69cdc8f6‐t7vv5 Cached time: 20241122141925 Cache expiry: 2592000 Reduced expiry: false Complications: [vary‐revision‐sha1, show‐toc] CPU time usage: 0.383 seconds Real time usage: 0.538 seconds Preprocessor visited node count: 2219/1000000 Post‐expand include size: 42467/2097152 bytes Template argument size: 1636/2097152 bytes Highest expansion depth: 16/100 Expensive parser function count: 1/500 Unstrip recursion depth: 1/20 Unstrip post‐expand size: 63041/5000000 bytes Lua time usage: 0.187/10.000 seconds Lua memory usage: 5813106/52428800 bytes Number of Wikibase entities loaded: 0/400 --> <!-- Transclusion expansion time report (%,ms,calls,template) 100.00% 360.733 1 -total 48.43% 174.691 1 Template:Reflist 31.03% 111.926 6 Template:Cite_book 21.34% 76.993 1 Template:Short_description 14.44% 52.093 9 Template:Cite_journal 14.01% 50.546 2 Template:Pagetype 7.09% 25.572 1 Template:Rp 6.14% 22.160 1 Template:R/superscript 4.83% 17.423 13 Template:Main_other 3.59% 12.952 1 Template:SDcat 
--> <!-- Saved in parser cache with key enwiki:pcache:idhash:2057290-0!canonical and timestamp 20241122141925 and revision id 1236851603. Rendering was triggered because: page-view --> </div><!--esi <esi:include src="/esitest-fa8a495983347898/content" /> --><noscript><img src="https://login.wikimedia.org/wiki/Special:CentralAutoLogin/start?type=1x1" alt="" width="1" height="1" style="border: none; position: absolute;"></noscript> <div class="printfooter" data-nosnippet="">Retrieved from "<a dir="ltr" href="https://en.wikipedia.org/w/index.php?title=Tf–idf&oldid=1236851603">https://en.wikipedia.org/w/index.php?title=Tf–idf&oldid=1236851603</a>"</div></div> <div id="catlinks" class="catlinks" data-mw="interface"><div id="mw-normal-catlinks" class="mw-normal-catlinks"><a href="/wiki/Help:Category" title="Help:Category">Categories</a>: <ul><li><a href="/wiki/Category:Statistical_natural_language_processing" title="Category:Statistical natural language processing">Statistical natural language processing</a></li><li><a href="/wiki/Category:Ranking_functions" title="Category:Ranking functions">Ranking functions</a></li><li><a href="/wiki/Category:Vector_space_model" title="Category:Vector space model">Vector space model</a></li></ul></div><div id="mw-hidden-catlinks" class="mw-hidden-catlinks mw-hidden-cats-hidden">Hidden categories: <ul><li><a href="/wiki/Category:Articles_with_short_description" title="Category:Articles with short description">Articles with short description</a></li><li><a href="/wiki/Category:Short_description_is_different_from_Wikidata" title="Category:Short description is different from Wikidata">Short description is different from Wikidata</a></li></ul></div></div> </div> </main> </div> <div class="mw-footer-container"> <footer id="footer" class="mw-footer" > <ul id="footer-info"> <li id="footer-info-lastmod"> This page was last edited on 26 July 2024, at 21:48<span class="anonymous-show"> (UTC)</span>.</li> <li id="footer-info-copyright">Text is 
available under the <a href="/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_4.0_International_License" title="Wikipedia:Text of the Creative Commons Attribution-ShareAlike 4.0 International License">Creative Commons Attribution-ShareAlike 4.0 License</a>; additional terms may apply. By using this site, you agree to the <a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Terms_of_Use" class="extiw" title="foundation:Special:MyLanguage/Policy:Terms of Use">Terms of Use</a> and <a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy" class="extiw" title="foundation:Special:MyLanguage/Policy:Privacy policy">Privacy Policy</a>. Wikipedia® is a registered trademark of the <a rel="nofollow" class="external text" href="https://wikimediafoundation.org/">Wikimedia Foundation, Inc.</a>, a non-profit organization.</li> </ul> <ul id="footer-places"> <li id="footer-places-privacy"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy">Privacy policy</a></li> <li id="footer-places-about"><a href="/wiki/Wikipedia:About">About Wikipedia</a></li> <li id="footer-places-disclaimers"><a href="/wiki/Wikipedia:General_disclaimer">Disclaimers</a></li> <li id="footer-places-contact"><a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us">Contact Wikipedia</a></li> <li id="footer-places-wm-codeofconduct"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Universal_Code_of_Conduct">Code of Conduct</a></li> <li id="footer-places-developers"><a href="https://developer.wikimedia.org">Developers</a></li> <li id="footer-places-statslink"><a href="https://stats.wikimedia.org/#/en.wikipedia.org">Statistics</a></li> <li id="footer-places-cookiestatement"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Cookie_statement">Cookie statement</a></li> <li id="footer-places-mobileview"><a 
href="//en.m.wikipedia.org/w/index.php?title=Tf%E2%80%93idf&mobileaction=toggle_view_mobile" class="noprint stopMobileRedirectToggle">Mobile view</a></li> </ul> <ul id="footer-icons" class="noprint"> <li id="footer-copyrightico"><a href="https://wikimediafoundation.org/" class="cdx-button cdx-button--fake-button cdx-button--size-large cdx-button--fake-button--enabled"><img src="/static/images/footer/wikimedia-button.svg" width="84" height="29" alt="Wikimedia Foundation" loading="lazy"></a></li> <li id="footer-poweredbyico"><a href="https://www.mediawiki.org/" class="cdx-button cdx-button--fake-button cdx-button--size-large cdx-button--fake-button--enabled"><img src="/w/resources/assets/poweredby_mediawiki.svg" alt="Powered by MediaWiki" width="88" height="31" loading="lazy"></a></li> </ul> </footer> </div> </div> </div> <div class="vector-settings" id="p-dock-bottom"> <ul></ul> </div><script>(RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgHostname":"mw-web.codfw.main-f69cdc8f6-jvw68","wgBackendResponseTime":145,"wgPageParseReport":{"limitreport":{"cputime":"0.383","walltime":"0.538","ppvisitednodes":{"value":2219,"limit":1000000},"postexpandincludesize":{"value":42467,"limit":2097152},"templateargumentsize":{"value":1636,"limit":2097152},"expansiondepth":{"value":16,"limit":100},"expensivefunctioncount":{"value":1,"limit":500},"unstrip-depth":{"value":1,"limit":20},"unstrip-size":{"value":63041,"limit":5000000},"entityaccesscount":{"value":0,"limit":400},"timingprofile":["100.00% 360.733 1 -total"," 48.43% 174.691 1 Template:Reflist"," 31.03% 111.926 6 Template:Cite_book"," 21.34% 76.993 1 Template:Short_description"," 14.44% 52.093 9 Template:Cite_journal"," 14.01% 50.546 2 Template:Pagetype"," 7.09% 25.572 1 Template:Rp"," 6.14% 22.160 1 Template:R/superscript"," 4.83% 17.423 13 Template:Main_other"," 3.59% 12.952 1 
Template:SDcat"]},"scribunto":{"limitreport-timeusage":{"value":"0.187","limit":"10.000"},"limitreport-memusage":{"value":5813106,"limit":52428800}},"cachereport":{"origin":"mw-web.codfw.main-f69cdc8f6-t7vv5","timestamp":"20241122141925","ttl":2592000,"transientcontent":false}}});});</script> <script type="application/ld+json">{"@context":"https:\/\/schema.org","@type":"Article","name":"Tf\u2013idf","url":"https:\/\/en.wikipedia.org\/wiki\/Tf%E2%80%93idf","sameAs":"http:\/\/www.wikidata.org\/entity\/Q796584","mainEntity":"http:\/\/www.wikidata.org\/entity\/Q796584","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\/\/www.wikimedia.org\/static\/images\/wmf-hor-googpub.png"}},"datePublished":"2005-06-16T15:43:29Z","dateModified":"2024-07-26T21:48:07Z","headline":"statistic which reflects how important a word is to a document in a collection or corpus"}</script> </body> </html>