# Machine Learning Glossary
This glossary defines general machine learning terms, plus terms specific to TensorFlow.

**Did You Know?** You can **filter the glossary** by choosing a topic from the Glossary drop-down in the top navigation bar. The hatching bird icon (the **#fundamentals** tag in this glossary) signifies definitions aimed at ML newcomers.

## A

## ablation

A technique for evaluating the importance of a [**feature**](#feature) or component by temporarily *removing* it from a [**model**](#model). You then retrain the model without that feature or component, and if the retrained model performs significantly worse, then the removed feature or component was likely important.

For example, suppose you train a [**classification model**](#classification_model) on 10 features and achieve 88% [**precision**](#precision) on the [**test set**](#test_set). To check the [**importance**](#variable-importances) of the first feature, you can retrain the model using only the nine other features. If the retrained model performs significantly worse (for instance, 55% precision), then the removed feature was probably important. Conversely, if the retrained model performs equally well, then that feature was probably not that important.

Ablation can also help determine the importance of:

- Larger components, such as an entire subsystem of a larger ML system
- Processes or techniques, such as a data preprocessing step

In both cases, you would observe how the system's performance changes (or doesn't change) after you've removed the component.
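The following is a minimal sketch of that feature-ablation procedure. The synthetic dataset, the choice of logistic regression, and the use of scikit-learn are all illustrative assumptions, not part of the glossary.

```python
# Ablation sketch: retrain without one feature and compare test precision.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 10))                    # 10 features
y = X[:, 0] + 0.1 * rng.normal(size=1000) > 0      # feature 0 drives the label

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

def test_precision(columns):
    """Train on the given feature columns and return precision on the test set."""
    model = LogisticRegression().fit(X_train[:, columns], y_train)
    return precision_score(y_test, model.predict(X_test[:, columns]))

baseline = test_precision(list(range(10)))         # all 10 features
ablated = test_precision(list(range(1, 10)))       # feature 0 removed, model retrained
print(f"all features: {baseline:.2f}  without feature 0: {ablated:.2f}")
# A large drop suggests the removed feature was important; little or no drop
# suggests it wasn't.
```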
## A/B testing

A statistical way of comparing two (or more) techniques: the *A* and the *B*. Typically, the *A* is an existing technique, and the *B* is a new technique. A/B testing not only determines which technique performs better but also whether the difference is statistically significant.

A/B testing usually compares a single [**metric**](#metric) on two techniques; for example, how does model [**accuracy**](#accuracy) compare for two techniques? However, A/B testing can also compare any finite number of metrics.
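As a rough sketch of the "statistically significant" part, the comparison below runs a two-proportion z-test on the accuracy of two techniques over the same test set. The counts and the use of statsmodels are assumptions for illustration; the glossary doesn't prescribe a particular test or library.

```python
# Hypothetical A/B comparison: did technique B's accuracy improve significantly?
from statsmodels.stats.proportion import proportions_ztest

correct = [712, 748]    # correct predictions by technique A and technique B (hypothetical)
total = [1000, 1000]    # number of test examples scored by each technique

z_stat, p_value = proportions_ztest(count=correct, nobs=total)
print(f"A: {correct[0] / total[0]:.1%}  B: {correct[1] / total[1]:.1%}  p = {p_value:.3f}")
# A small p-value (for example, below 0.05) suggests the accuracy difference is
# statistically significant rather than noise from this particular test set.
```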
## accelerator chip

#GoogleCloud

A category of specialized hardware components designed to perform key computations needed for deep learning algorithms.

Accelerator chips (or just **accelerators**, for short) can significantly increase the speed and efficiency of training and inference tasks compared to a general-purpose CPU. They are ideal for training neural networks and similar computationally intensive tasks.

Examples of accelerator chips include:

- Google's Tensor Processing Units ([**TPUs**](#TPU)), with dedicated hardware for deep learning.
- NVIDIA's GPUs, which, though initially designed for graphics processing, enable parallel processing that can significantly increase processing speed.

## accuracy

#fundamentals

The number of correct classification [**predictions**](#prediction) divided by the total number of predictions. That is:

$$\text{Accuracy} = \frac{\text{correct predictions}}{\text{correct predictions} + \text{incorrect predictions}}$$

For example, a model that made 40 correct predictions and 10 incorrect predictions would have an accuracy of:

$$\text{Accuracy} = \frac{40}{40 + 10} = 80\%$$

[**Binary classification**](#binary_classification) provides specific names for the different categories of *correct predictions* and *incorrect predictions*. So, the accuracy formula for binary classification is as follows:

$$\text{Accuracy} = \frac{\text{TP} + \text{TN}}{\text{TP} + \text{TN} + \text{FP} + \text{FN}}$$

where:

- TP is the number of [**true positives**](#TP) (correct predictions).
- TN is the number of [**true negatives**](#TN) (correct predictions).
- FP is the number of [**false positives**](#FP) (incorrect predictions).
- FN is the number of [**false negatives**](#FN) (incorrect predictions).

Compare and contrast accuracy with [**precision**](#precision) and [**recall**](#recall).

Although a valuable metric for some situations, accuracy is highly misleading for others. Notably, accuracy is usually a poor metric for evaluating classification models that process [**class-imbalanced datasets**](#class_imbalanced_data_set).

For example, suppose snow falls only 25 days per century in a certain subtropical city. Since days without snow (the negative class) vastly outnumber days with snow (the positive class), the snow dataset for this city is class-imbalanced. Imagine a [**binary classification**](#binary-classification) model that is supposed to predict either snow or no snow each day but simply predicts "no snow" every day. This model is highly accurate but has no predictive power. The following table summarizes the results for a century of predictions:

| Category | Number |
| --- | --- |
| TP | 0 |
| TN | 36499 |
| FP | 0 |
| FN | 25 |

The accuracy of this model is therefore:

```
accuracy = (TP + TN) / (TP + TN + FP + FN)
accuracy = (0 + 36499) / (0 + 36499 + 0 + 25) = 0.9993 = 99.93%
```

Although 99.93% accuracy seems like a very impressive percentage, the model actually has no predictive power.

[**Precision**](#precision) and [**recall**](#recall) are usually more useful metrics than **accuracy** for evaluating models trained on class-imbalanced datasets.

See [Classification: Accuracy, recall, precision and related metrics](/machine-learning/crash-course/classification/accuracy-precision-recall) in Machine Learning Crash Course for more information.
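A quick check of the binary-classification formula, reproducing the snow example's counts in plain Python:

```python
# Accuracy (and recall) from a binary confusion matrix, using the snow example's counts.
def accuracy(tp, tn, fp, fn):
    return (tp + tn) / (tp + tn + fp + fn)

tp, tn, fp, fn = 0, 36499, 0, 25
print(f"accuracy = {accuracy(tp, tn, fp, fn):.4f}")   # 0.9993, i.e. ~99.93%

# Recall exposes the lack of predictive power: the model never catches a snowy day.
recall = tp / (tp + fn)
print(f"recall = {recall}")                           # 0.0
```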
## action

#rl

In [**reinforcement learning**](#reinforcement_learning), the mechanism by which the [**agent**](#agent) transitions between [**states**](#state) of the [**environment**](#environment). The agent chooses the action by using a [**policy**](#policy).

## activation function

#fundamentals

A function that enables [**neural networks**](#neural_network) to learn [**nonlinear**](#nonlinear) (complex) relationships between features and the label.

Popular activation functions include:

- [**ReLU**](#ReLU)
- [**Sigmoid**](#sigmoid-function)

The plots of activation functions are never single straight lines. For example, the plot of the ReLU activation function consists of two straight lines:

![A Cartesian plot of the ReLU function: y is 0 for all x less than 0, then a line of slope +1 runs from the origin toward +infinity.](/static/machine-learning/glossary/images/relu.svg)

A plot of the sigmoid activation function looks as follows:

![A two-dimensional curved plot with x values spanning -infinity to +infinity and y values spanning almost 0 to almost 1. When x is 0, y is 0.5. The slope is always positive, highest at (0, 0.5), and gradually decreasing as the absolute value of x increases.](/static/machine-learning/glossary/images/sigmoid.svg)

In a neural network, activation functions manipulate the [**weighted sum**](#weighted_sum) of all the inputs to a [**neuron**](#neuron). To calculate a weighted sum, the neuron adds up the products of the relevant values and weights. For example, suppose the relevant input to a neuron consists of the following:

| input value | input weight |
| --- | --- |
| 2 | -1.3 |
| -1 | 0.6 |
| 3 | 0.4 |

The weighted sum is therefore:

```
weighted sum = (2)(-1.3) + (-1)(0.6) + (3)(0.4) = -2.0
```

Suppose the designer of this neural network chooses the [**sigmoid function**](#sigmoid-function) to be the activation function. In that case, the neuron calculates the sigmoid of -2.0, which is approximately 0.12. Therefore, the neuron passes 0.12 (rather than -2.0) to the next layer in the neural network. The following figure illustrates the relevant part of the process:

![An input layer passes three feature values and three weights to a neuron in a hidden layer. The neuron calculates the raw value (-2.0) and passes it to the activation function, which calculates the sigmoid of the raw value and passes the result (0.12) to the next layer of the neural network.](/static/machine-learning/glossary/images/ActivationFunction_sigmoid.png)

See [Neural networks: Activation functions](/machine-learning/crash-course/neural-networks/activation-functions) in Machine Learning Crash Course for more information.
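The short sketch below reproduces the worked example above in plain Python: the weighted sum of the inputs is -2.0, and the sigmoid of that sum is roughly 0.12 (ReLU is shown only for contrast).

```python
# Sigmoid and ReLU applied to the weighted sum from the example above.
import math

inputs  = [2, -1, 3]
weights = [-1.3, 0.6, 0.4]
weighted_sum = sum(x * w for x, w in zip(inputs, weights))   # -2.0

def sigmoid(x):
    return 1 / (1 + math.exp(-x))

def relu(x):
    return max(0.0, x)

print(weighted_sum)                       # -2.0
print(round(sigmoid(weighted_sum), 2))    # 0.12 -- the value passed to the next layer
print(relu(weighted_sum))                 # 0.0  -- ReLU clips negative inputs to zero
```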
## active learning

A [**training**](#training) approach in which the algorithm *chooses* some of the data it learns from. Active learning is particularly valuable when [**labeled examples**](#labeled_example) are scarce or expensive to obtain. Instead of blindly seeking a diverse range of labeled examples, an active learning algorithm selectively seeks the particular range of examples it needs for learning.

## AdaGrad

A sophisticated gradient descent algorithm that rescales the gradients of each [**parameter**](#parameter), effectively giving each parameter an independent [**learning rate**](#learning_rate). For a full explanation, see [this AdaGrad paper](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf).

## agent

#rl

In [**reinforcement learning**](#reinforcement_learning), the entity that uses a [**policy**](#policy) to maximize the expected [**return**](#return) gained from transitioning between [**states**](#state) of the [**environment**](#environment).

More generally, an agent is software that autonomously plans and executes a series of actions in pursuit of a goal, with the ability to adapt to changes in its environment. For example, an [**LLM**](#LLM)-based agent might use an LLM to generate a plan, rather than applying a reinforcement learning policy.

## agglomerative clustering

#clustering

See [**hierarchical clustering**](#hierarchical_clustering).

## anomaly detection

The process of identifying [**outliers**](#outliers). For example, if the mean for a certain [**feature**](#feature) is 100 with a standard deviation of 10, then anomaly detection should flag a value of 200 as suspicious.
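A minimal sketch of that example as a z-score check; the three-standard-deviation threshold is an illustrative choice, not something the glossary specifies.

```python
# Flag values that sit far from the mean of a feature, in standard-deviation units.
def is_anomaly(value, mean, std, threshold=3.0):
    """Return True if `value` is more than `threshold` standard deviations from `mean`."""
    z_score = abs(value - mean) / std
    return z_score > threshold

print(is_anomaly(200, mean=100, std=10))   # True: z = 10, well past the threshold
print(is_anomaly(112, mean=100, std=10))   # False: z = 1.2, within the expected range
```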
## AR

Abbreviation for [**augmented reality**](#augmented_reality).

## area under the PR curve

See [**PR AUC (Area under the PR Curve)**](#PR_AUC).

## area under the ROC curve

See [**AUC (Area under the ROC curve)**](#AUC).

## artificial general intelligence

A non-human mechanism that demonstrates a *broad range* of problem solving, creativity, and adaptability. For example, a program demonstrating artificial general intelligence could translate text, compose symphonies, *and* excel at games that have not yet been invented.

## artificial intelligence

#fundamentals

A non-human program or [**model**](#model) that can solve sophisticated tasks. For example, a program or model that translates text, or a program or model that identifies diseases from radiologic images, exhibits artificial intelligence.

Formally, [**machine learning**](#machine_learning) is a sub-field of artificial intelligence. However, in recent years, some organizations have begun using the terms *artificial intelligence* and *machine learning* interchangeably.

## attention

#language

A mechanism used in a [**neural network**](#neural_network) that indicates the importance of a particular word or part of a word. Attention compresses the amount of information a model needs to predict the next token/word. A typical attention mechanism might consist of a [**weighted sum**](#weighted_sum) over a set of inputs, where the [**weight**](#weight) for each input is computed by another part of the neural network.

Refer also to [**self-attention**](#self-attention) and [**multi-head self-attention**](#multi-head-self-attention), which are the building blocks of [**Transformers**](#Transformer).

See [LLMs: What's a large language model?](/machine-learning/crash-course/llm/transformers) in Machine Learning Crash Course for more information about self-attention.
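A minimal numeric sketch of the "weighted sum over a set of inputs" idea, assuming NumPy. The scores stand in for whatever part of the network computes the weights, and the input vectors are hypothetical.

```python
# Attention as a softmax-weighted sum over input vectors.
import numpy as np

values = np.array([[1.0, 0.0],     # one row per input (hypothetical embeddings)
                   [0.0, 1.0],
                   [1.0, 1.0]])
scores = np.array([2.0, 0.5, 1.0]) # relevance scores computed elsewhere in the network

weights = np.exp(scores) / np.exp(scores).sum()   # softmax turns scores into weights
context = weights @ values                        # the weighted sum over the inputs

print(weights.round(2))   # [0.63 0.14 0.23]
print(context.round(2))   # [0.86 0.37]
```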
## attribute

#fairness

Synonym for [**feature**](#feature).

In machine learning fairness, attributes often refer to characteristics pertaining to individuals.

## attribute sampling

#df

A tactic for training a [**decision forest**](#decision-forest) in which each [**decision tree**](#decision-tree) considers only a random subset of possible [**features**](#feature) when learning the [**condition**](#condition). Generally, a different subset of features is sampled for each [**node**](#node-decision-tree). In contrast, when training a decision tree without attribute sampling, all possible features are considered for each node.

## AUC (Area under the ROC curve)

#fundamentals

A number between 0.0 and 1.0 representing a [**binary classification**](#binary-classification) model's ability to separate [**positive classes**](#positive_class) from [**negative classes**](#negative_class). The closer the AUC is to 1.0, the better the model's ability to separate classes from each other.

For example, the following illustration shows a classifier model that separates positive classes (green ovals) from negative classes (purple rectangles) perfectly. This unrealistically perfect model has an AUC of 1.0:

![A number line with 8 positive examples on one side and 9 negative examples on the other side.](/static/machine-learning/glossary/images/AUCIdealClassSeparation.png)

Conversely, the following illustration shows the results for a classifier model that generated random results. This model has an AUC of 0.5:

![A number line with 6 positive and 6 negative examples strictly alternating: positive, negative, positive, negative, and so on.](/static/machine-learning/glossary/images/AUCSetupPNPNPN.png)

Yes, the preceding model has an AUC of 0.5, not 0.0.

Most models are somewhere between the two extremes. For instance, the following model separates positives from negatives somewhat, and therefore has an AUC somewhere between 0.5 and 1.0:

![A number line with 6 positive and 6 negative examples in the sequence: negative, negative, negative, negative, positive, negative, positive, positive, negative, positive, positive, positive.](/static/machine-learning/glossary/images/AUCSetupTypical.png)

AUC ignores any value you set for [**classification threshold**](#classification_threshold). Instead, AUC considers *all* possible classification thresholds.

AUC represents the *area* under an [**ROC curve**](#ROC). For example, the ROC curve for a model that perfectly separates positives from negatives looks as follows:

![Cartesian plot. The x-axis is false positive rate; the y-axis is true positive rate. The graph starts at (0,0), goes straight up to (0,1), and then straight right to (1,1).](/static/machine-learning/glossary/images/AUC1_0.png)

AUC is the area of the gray region in the preceding illustration. In this unusual case, the area is simply the length of the gray region (1.0) multiplied by the width of the gray region (1.0). So, the product of 1.0 and 1.0 yields an AUC of exactly 1.0, which is the highest possible AUC score.

Conversely, the ROC curve for a classifier that can't separate classes at all is as follows. The area of this gray region is 0.5.

![Cartesian plot. The x-axis is false positive rate; the y-axis is true positive rate. The graph starts at (0,0) and goes diagonally to (1,1).](/static/machine-learning/glossary/images/AUC0_5.png)

A more typical ROC curve looks approximately like the following:

![Cartesian plot. The x-axis is false positive rate; the y-axis is true positive rate. The graph starts at (0,0) and takes an irregular arc to (1,1).](/static/machine-learning/glossary/images/ROCTypicalGraph.png)

It would be painstaking to calculate the area under this curve manually, which is why a program typically calculates most AUC values.

More formally, AUC is the probability that a classifier will be more confident that a randomly chosen positive example is actually positive than that a randomly chosen negative example is positive.

See [Classification: ROC and AUC](/machine-learning/crash-course/classification/roc-and-auc) in Machine Learning Crash Course for more information.
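The formal definition above suggests a direct (if inefficient) way to compute AUC: compare every positive example's score against every negative example's score. The scores below are hypothetical model outputs, not taken from the glossary.

```python
# AUC via the pairwise-probability interpretation (ties count as half).
pos_scores = [0.9, 0.8, 0.75, 0.6]   # scores assigned to positive examples (hypothetical)
neg_scores = [0.7, 0.5, 0.4, 0.3]    # scores assigned to negative examples (hypothetical)

pairs = [(p, n) for p in pos_scores for n in neg_scores]
auc = sum(1.0 if p > n else 0.5 if p == n else 0.0 for p, n in pairs) / len(pairs)
print(auc)   # 0.9375 -- one negative (0.7) outranks one positive (0.6)
```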
So, the product of 1.0 and 1.0 yields an AUC of exactly 1.0, which is the highest possible AUC score.</p> <p>Conversely, the ROC curve for a classifier that can't separate classes at all is as follows. The area of this gray region is 0.5.</p> <p> <img src="/static/machine-learning/glossary/images/AUC0_5.png" loading="lazy" alt="Cartesian plot. x-axis is false positive rate; y-axis is true positive rate. Graph starts at 0,0 and goes diagonally to 1,1." > </p> <p>A more typical ROC curve looks approximately like the following:</p> <p> <img src="/static/machine-learning/glossary/images/ROCTypicalGraph.png" loading="lazy" alt="Cartesian plot. x-axis is false positive rate; y-axis is true positive rate. Graph starts at 0,0 and takes an irregular arc to 1,0." > </p> <p> It would be painstaking to calculate the area under this curve manually, which is why a program typically calculates most AUC values. </p> </div> <hr /> </section> <section class="expandable"> <h4 class="showalways" id="click-the-icon-for-a-more-formal-definition-of-auc." data-text=" Click the icon for a more formal definition of AUC. " tabindex="-1"> Click the icon for a more formal definition of AUC. </h4> <div class="expand-background"> <p> AUC is the probability that a classifier will be more confident that a randomly chosen positive example is actually positive than that a randomly chosen negative example is positive. </p> </div> <hr /> </section> <p>See <a href="/machine-learning/crash-course/classification/roc-and-auc">Classification: ROC and AUC</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="augmented_reality"></a> <h2 class="hide-from-toc" id="augmented-reality" data-text=" augmented reality" tabindex="-1"> augmented reality</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>A technology that superimposes a computer-generated image on a user&#39;s view of the real world, thus providing a composite view.</p> <p><a class="glossary-anchor" name="autoencoder"></a> <h2 class="hide-from-toc" id="autoencoder" data-text=" autoencoder" tabindex="-1"> autoencoder</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>A system that learns to extract the most important information from the input. Autoencoders are a combination of an <a href="#encoder"><strong>encoder</strong></a> and <a href="#decoder"><strong>decoder</strong></a>. Autoencoders rely on the following two-step process:</p> <ol> <li>The encoder maps the input to a (typically) lossy lower-dimensional (intermediate) format.</li> <li>The decoder builds a lossy version of the original input by mapping the lower-dimensional format to the original higher-dimensional input format.</li> </ol> <p>Autoencoders are trained end-to-end by having the decoder attempt to reconstruct the original input from the encoder&#39;s intermediate format as closely as possible. Because the intermediate format is smaller (lower-dimensional) than the original format, the autoencoder is forced to learn what information in the input is essential, and the output won&#39;t be perfectly identical to the input.</p> <p>For example:</p> <ul> <li>If the input data is a graphic, the non-exact copy would be similar to the original graphic, but somewhat modified. 
Perhaps the non-exact copy removes noise from the original graphic or fills in some missing pixels.</li> <li>If the input data is text, an autoencoder would generate new text that mimics (but is not identical to) the original text.</li> </ul> <p>See also <a href="#variational-autoencoder"><strong>variational autoencoders</strong></a>.</p> <p><a class="glossary-anchor" name="automation_bias"></a> <h2 class="hide-from-toc" id="automation-bias" data-text=" automation bias " tabindex="-1"> automation bias </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>When a human decision maker favors recommendations made by an automated decision-making system over information made without automation, even when the automated decision-making system makes errors.</p> <p>See <a href="/machine-learning/crash-course/fairness/types-of-bias">Fairness: Types of bias</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="AutoML"></a> <h2 class="hide-from-toc" id="automl" data-text=" AutoML " tabindex="-1"> AutoML </h2></p> <p>Any automated process for building <a href="#machine_learning"><strong>machine learning</strong></a> <a href="#model"><strong>models</strong></a>. AutoML can automatically do tasks such as the following:</p> <ul> <li>Search for the most appropriate model.</li> <li>Tune <a href="#hyperparameter"><strong>hyperparameters</strong></a>.</li> <li>Prepare data (including performing <a href="#feature_engineering"><strong>feature engineering</strong></a>).</li> <li>Deploy the resulting model.</li> </ul> <p>AutoML is useful for data scientists because it can save them time and effort in developing machine learning pipelines and improve prediction accuracy. It is also useful to non-experts, by making complicated machine learning tasks more accessible to them.</p> <p>See <a href="/machine-learning/crash-course/automl">Automated Machine Learning (AutoML)</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="auto-regressive-model"></a> <h2 class="hide-from-toc" id="auto-regressive-model" data-text=" auto-regressive model " tabindex="-1"> auto-regressive model </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Image Models">#image</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>A <a href="#model"><strong>model</strong></a> that infers a prediction based on its own previous predictions. For example, auto-regressive language models predict the next <a href="#token"><strong>token</strong></a> based on the previously predicted tokens. All <a href="#transformer"><strong>Transformer</strong></a>-based <a href="#large-language-model"><strong>large language models</strong></a> are auto-regressive.</p> <p>In contrast, <a href="#GAN"><strong>GAN</strong></a>-based image models are usually not auto-regressive since they generate an image in a single forward-pass and not iteratively in steps. 
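</p> <p>For language models, that step-by-step dependence looks roughly like the following sketch, where <code translate="no" dir="ltr">predict_next_token</code> is a hypothetical stand-in for a trained model rather than a real API:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Python"><code translate="no" dir="ltr">def generate(prompt_tokens, predict_next_token, max_new_tokens=20):
  """Greedily generates tokens, feeding each prediction back in as input."""
  tokens = list(prompt_tokens)
  for _ in range(max_new_tokens):
    next_token = predict_next_token(tokens)  # conditioned on all prior tokens
    tokens.append(next_token)                # the prediction becomes new input
  return tokens
</code></pre></devsite-code> <p>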
However, certain image generation models <em>are</em> auto-regressive because they generate an image in steps.</p> <p><a class="glossary-anchor" name="auxiliary-loss"></a> <h2 class="hide-from-toc" id="auxiliary-loss" data-text=" auxiliary loss " tabindex="-1"> auxiliary loss </h2></p> <p>A <a href="#loss-function"><strong>loss function</strong></a>—used in conjunction with a <a href="#neural-network"><strong>neural network</strong></a> <a href="#model"><strong>model&#39;s</strong></a> main loss function—that helps accelerate <a href="#training"><strong>training</strong></a> during the early iterations when weights are randomly initialized.</p> <p>Auxiliary loss functions push effective <a href="#gradient"><strong>gradients</strong></a> to the earlier <a href="#layer"><strong>layers</strong></a>. This facilitates <a href="#convergence"><strong>convergence</strong></a> during <a href="#training"><strong>training</strong></a> by combating the <a href="#vanishing_gradient_problem"><strong>vanishing gradient problem</strong></a>.</p> <p><a class="glossary-anchor" name="average_precision"></a> <h2 class="hide-from-toc" id="average-precision" data-text=" average precision " tabindex="-1"> average precision </h2></p> <p>A metric for summarizing the performance of a ranked sequence of results. Average precision is calculated by taking the average of the <a href="#precision"><strong>precision</strong></a> values for each relevant result (each result in the ranked list where the recall increases relative to the previous result).</p> <p>See also <a href="#area_under_the_pr_curve"><strong>Area under the PR Curve</strong></a>.</p> <p><a class="glossary-anchor" name="axis-aligned-condition"></a> <h2 class="hide-from-toc" id="axis-aligned-condition" data-text=" axis-aligned condition " tabindex="-1"> axis-aligned condition </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>In a <a href="#decision-tree"><strong>decision tree</strong></a>, a <a href="#condition"><strong>condition</strong></a> that involves only a single <a href="#feature"><strong>feature</strong></a>. For example, if area is a feature, then the following is an axis-aligned condition:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Text only">area > 200</pre></devsite-code> <p>Contrast with <a href="#oblique-condition"><strong>oblique condition</strong></a>.</p> <p><a class="glossary-anchor" name="b"></a> <h2 class="glossary" id="b" data-text="B" tabindex="-1">B</h2></p> </li> </ul> <p><a class="glossary-anchor" name="backpropagation"></a> <h2 class="hide-from-toc" id="backpropagation" data-text=" backpropagation" tabindex="-1"> backpropagation</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The algorithm that implements <a href="#gradient_descent"><strong>gradient descent</strong></a> in <a href="#neural_network"><strong>neural networks</strong></a>.</p> <p>Training a neural network involves many <a href="#iteration"><strong>iterations</strong></a> of the following two-pass cycle:</p> <ol> <li>During the <strong>forward pass</strong>, the system processes a <a href="#batch"><strong>batch</strong></a> of <a href="#example"><strong>examples</strong></a> to yield prediction(s). The system compares each prediction to each <a href="#label"><strong>label</strong></a> value. 
The difference between the prediction and the label value is the <a href="#loss"><strong>loss</strong></a> for that example. The system aggregates the losses for all the examples to compute the total loss for the current batch.</li> <li>During the <strong>backward pass</strong> (backpropagation), the system reduces loss by adjusting the weights of all the <a href="#neuron"><strong>neurons</strong></a> in all the <a href="#hidden_layer"><strong>hidden layer(s)</strong></a>.</li> </ol> <p>Neural networks often contain many neurons across many hidden layers. Each of those neurons contributes to the overall loss in different ways. Backpropagation determines whether to increase or decrease the weights applied to particular neurons.</p> <p>The <a href="#learning_rate"><strong>learning rate</strong></a> is a multiplier that controls the degree to which each backward pass increases or decreases each weight. A large learning rate will increase or decrease each weight more than a small learning rate.</p> <p>In calculus terms, backpropagation implements the <a href="https://www.khanacademy.org/math/ap-calculus-ab/ab-differentiation-2-new/ab-3-1a/v/chain-rule-introduction"><strong>chain rule</strong></a> from calculus. That is, backpropagation calculates the <a href="#partial_derivative"><strong>partial derivative</strong></a> of the error with respect to each parameter.</p> <p>Years ago, ML practitioners had to write code to implement backpropagation. Modern ML APIs like Keras now implement backpropagation for you. Phew!</p> <p>See <a href="/machine-learning/crash-course/neural-networks">Neural networks</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="bagging"></a> <h2 class="hide-from-toc" id="bagging" data-text=" bagging " tabindex="-1"> bagging </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>A method to <a href="#training"><strong>train</strong></a> an <a href="#ensemble"><strong>ensemble</strong></a> where each constituent <a href="#model"><strong>model</strong></a> trains on a random subset of training examples <a href="#sampling-with-replacement"><strong>sampled with replacement</strong></a>. For example, a <a href="#random-forest"><strong>random forest</strong></a> is a collection of <a href="#decision-tree"><strong>decision trees</strong></a> trained with bagging.</p> <p>The term <strong>bagging</strong> is short for <strong>b</strong>ootstrap <strong>agg</strong>regat<strong>ing</strong>.</p> <p>See <a href="/machine-learning/decision-forests/random-forests">Random forests</a> in the Decision Forests course for more information.</p> <p><a class="glossary-anchor" name="bag_of_words"></a> <h2 class="hide-from-toc" id="bag-of-words" data-text=" bag of words" tabindex="-1"> bag of words</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A representation of the words in a phrase or passage, irrespective of order. For example, bag of words represents the following three phrases identically:</p> <ul> <li>the dog jumps</li> <li>jumps the dog</li> <li>dog jumps the</li> </ul> <p>Each word is mapped to an index in a <a href="#sparse_vector"><strong>sparse vector</strong></a>, where the vector has an index for every word in the vocabulary.
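</p> <p>A minimal sketch of building such a count vector, assuming a small fixed vocabulary:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Python"><code translate="no" dir="ltr">vocabulary = ["the", "dog", "jumps", "cat", "runs"]  # toy vocabulary
word_to_index = {word: i for i, word in enumerate(vocabulary)}

def bag_of_words(phrase):
  """Returns a count vector with one slot per vocabulary word."""
  vector = [0] * len(vocabulary)
  for word in phrase.split():
    vector[word_to_index[word]] += 1
  return vector

print(bag_of_words("the dog jumps"))  # [1, 1, 1, 0, 0]
</code></pre></devsite-code> <p>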
For example, the phrase <em>the dog jumps</em> is mapped into a feature vector with non-zero values at the three indexes corresponding to the words <em>the</em>, <em>dog</em>, and <em>jumps</em>. The non-zero value can be any of the following:</p> <ul> <li>A 1 to indicate the presence of a word.</li> <li>A count of the number of times a word appears in the bag. For example, if the phrase were <em>the maroon dog is a dog with maroon fur</em>, then both <em>maroon</em> and <em>dog</em> would be represented as 2, while the other words would be represented as 1.</li> <li>Some other value, such as the logarithm of the count of the number of times a word appears in the bag.</li> </ul> <p><a class="glossary-anchor" name="baseline"></a> <h2 class="hide-from-toc" id="baseline" data-text=" baseline" tabindex="-1"> baseline</h2></p> <p>A <a href="#model"><strong>model</strong></a> used as a reference point for comparing how well another model (typically, a more complex one) is performing. For example, a <a href="#logistic_regression"><strong>logistic regression model</strong></a> might serve as a good baseline for a <a href="#deep_model"><strong>deep model</strong></a>.</p> <p>For a particular problem, the baseline helps model developers quantify the minimal expected performance that a new model must achieve for the new model to be useful.</p> <p><a class="glossary-anchor" name="batch"></a> <h2 class="hide-from-toc" id="batch" data-text=" batch" tabindex="-1"> batch</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The set of <a href="#example"><strong>examples</strong></a> used in one training <a href="#iteration"><strong>iteration</strong></a>. The <a href="#batch_size"><strong>batch size</strong></a> determines the number of examples in a batch.</p> <p>See <a href="#epoch"><strong>epoch</strong></a> for an explanation of how a batch relates to an epoch.</p> <p>See <a href="/machine-learning/crash-course/linear-regression/hyperparameters">Linear regression: Hyperparameters</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="batch_inference"></a> <h2 class="hide-from-toc" id="batch-inference" data-text=" batch inference" tabindex="-1"> batch inference</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> <div class="glossary-icon" title="Google Cloud">#GoogleCloud</div> </div></p> <p>The process of <a href="#inference"><strong>inferring</strong></a> predictions on multiple <a href="#unlabeled_example"><strong>unlabeled examples</strong></a> divided into smaller subsets (&quot;batches&quot;).</p> <p>Batch inference can take advantage of the parallelization features of <a href="#accelerator-chip"><strong>accelerator chips</strong></a>. 
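</p> <p>At its core, batch inference just chunks the unlabeled examples and runs the model on one chunk at a time, as in the following sketch. The <code translate="no" dir="ltr">model</code> object and its <code translate="no" dir="ltr">predict()</code> method are hypothetical placeholders:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Python"><code translate="no" dir="ltr">def batch_inference(model, examples, batch_size=128):
  """Runs inference one batch at a time and gathers all the predictions."""
  predictions = []
  for start in range(0, len(examples), batch_size):
    batch = examples[start:start + batch_size]
    predictions.extend(model.predict(batch))  # hypothetical predict() call
  return predictions
</code></pre></devsite-code> <p>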
That is, multiple accelerators can simultaneously infer predictions on different batches of unlabeled examples, dramatically increasing the number of inferences per second.</p> <p>See <a href="/machine-learning/crash-course/production-ml-systems/static-vs-dynamic-inference">Production ML systems: Static versus dynamic inference</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="batch_normalization"></a> <h2 class="hide-from-toc" id="batch-normalization" data-text=" batch normalization" tabindex="-1"> batch normalization</h2></p> <p><a href="#normalization"><strong>Normalizing</strong></a> the input or output of the <a href="#activation_function"><strong>activation functions</strong></a> in a <a href="#hidden_layer"><strong>hidden layer</strong></a>. Batch normalization can provide the following benefits:</p> <ul> <li>Make <a href="#neural_network"><strong>neural networks</strong></a> more stable by protecting against <a href="#outliers"><strong>outlier</strong></a> weights.</li> <li>Enable higher <a href="#learning_rate"><strong>learning rates</strong></a>, which can speed training.</li> <li>Reduce <a href="#overfitting"><strong>overfitting</strong></a>.</li> </ul> <p><a class="glossary-anchor" name="batch_size"></a> <h2 class="hide-from-toc" id="batch-size" data-text=" batch size" tabindex="-1"> batch size</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The number of <a href="#example"><strong>examples</strong></a> in a <a href="#batch"><strong>batch</strong></a>. For instance, if the batch size is 100, then the model processes 100 examples per <a href="#iteration"><strong>iteration</strong></a>.</p> <p>The following are popular batch size strategies:</p> <ul> <li><a href="#SGD"><strong>Stochastic Gradient Descent (SGD)</strong></a>, in which the batch size is 1.</li> <li>Full batch, in which the batch size is the number of examples in the entire <a href="#training_set"><strong>training set</strong></a>. For instance, if the training set contains a million examples, then the batch size would be a million examples. Full batch is usually an inefficient strategy.</li> <li><a href="#mini-batch"><strong>mini-batch</strong></a> in which the batch size is usually between 10 and 1000. Mini-batch is usually the most efficient strategy.</li> </ul> <p>See the following for more information:</p> <ul> <li><a href="/machine-learning/crash-course/production-ml-systems/static-vs-dynamic-inference">Production ML systems: Static versus dynamic inference</a> in Machine Learning Crash Course.</li> <li><a href="/machine-learning/guides/deep-learning-tuning-playbook">Deep Learning Tuning Playbook</a>.</li> </ul> <p><a class="glossary-anchor" name="Bayesian_neural_network"></a> <h2 class="hide-from-toc" id="bayesian-neural-network" data-text=" Bayesian neural network" tabindex="-1"> Bayesian neural network</h2></p> <p>A probabilistic <a href="#neural_network"><strong>neural network</strong></a> that accounts for uncertainty in <a href="#weight"><strong>weights</strong></a> and outputs. A standard neural network regression model typically <a href="#prediction"><strong>predicts</strong></a> a scalar value; for example, a standard model predicts a house price of 853,000. 
In contrast, a Bayesian neural network predicts a distribution of values; for example, a Bayesian model predicts a house price of 853,000 with a standard deviation of 67,200.</p> <p>A Bayesian neural network relies on <a href="https://betterexplained.com/articles/an-intuitive-and-short-explanation-of-bayes-theorem/" target="T"> Bayes&#39; Theorem</a> to calculate uncertainties in weights and predictions. A Bayesian neural network can be useful when it is important to quantify uncertainty, such as in models related to pharmaceuticals. Bayesian neural networks can also help prevent <a href="#overfitting"><strong>overfitting</strong></a>.</p> <p><a class="glossary-anchor" name="Bayesian_optimization"></a> <h2 class="hide-from-toc" id="bayesian-optimization" data-text=" Bayesian optimization" tabindex="-1"> Bayesian optimization</h2></p> <p>A <a href="#probabilistic-regression-model"><strong>probabilistic regression model</strong></a> technique for optimizing computationally expensive <a href="#objective_function"><strong>objective functions</strong></a> by instead optimizing a surrogate that quantifies the uncertainty using a Bayesian learning technique. Since Bayesian optimization is itself very expensive, it is usually used to optimize expensive-to-evaluate tasks that have a small number of parameters, such as selecting <a href="#hyperparameter"><strong>hyperparameters</strong></a>.</p> <p><a class="glossary-anchor" name="bellman_equation"></a> <h2 class="hide-from-toc" id="bellman-equation" data-text=" Bellman equation" tabindex="-1"> Bellman equation</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>In reinforcement learning, the following identity satisfied by the optimal <a href="#q-function"><strong>Q-function</strong></a>:</p> <p>\[Q(s, a) = r(s, a) + \gamma \mathbb{E}_{s&#39;|s,a} \max_{a&#39;} Q(s&#39;, a&#39;)\]</p> <p><a href="#reinforcement_learning"><strong>Reinforcement learning</strong></a> algorithms apply this identity to create <a href="#q-learning"><strong>Q-learning</strong></a> via the following update rule:</p> <p>\[Q(s,a) \gets Q(s,a) + \alpha \left[r(s,a) + \gamma \displaystyle\max_{a&#39;} Q(s&#39;,a&#39;) - Q(s,a) \right] \]</p> <p>Beyond reinforcement learning, the Bellman equation has applications to dynamic programming. See the <a href="https://wikipedia.org/wiki/Bellman_equation" target="T"> Wikipedia entry for Bellman equation</a>.</p>
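<p>As an illustration, the following sketch applies this tabular Q-learning update to a single observed transition. The states, actions, and reward are hypothetical:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Python"><code translate="no" dir="ltr">import collections

# Q-table mapping (state, action) pairs to estimated returns.
Q = collections.defaultdict(float)

def q_learning_update(state, action, reward, next_state, actions,
                      alpha=0.1, gamma=0.9):
  """Applies the Q-learning update rule to one transition."""
  best_next = max(Q[(next_state, a)] for a in actions)
  target = reward + gamma * best_next
  Q[(state, action)] += alpha * (target - Q[(state, action)])

# One hypothetical transition: taking "right" in state 0 yields reward 1.0.
q_learning_update(state=0, action="right", reward=1.0, next_state=1,
                  actions=["left", "right"])
print(Q[(0, "right")])  # 0.1
</code></pre></devsite-code> <p>Repeating this update over many sampled transitions gradually moves each Q value toward the right-hand side of the Bellman equation.</p>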
<p><a class="glossary-anchor" name="BERT"></a> <h2 class="hide-from-toc" id="bert-bidirectional-encoder-representations-from-transformers" data-text=" BERT (Bidirectional Encoder Representations from Transformers)" tabindex="-1"> BERT (Bidirectional Encoder Representations from Transformers)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A model architecture for text <a href="#representation"><strong>representation</strong></a>. A trained BERT model can act as part of a larger model for text classification or other ML tasks.</p> <p>BERT has the following characteristics:</p> <ul> <li>Uses the <a href="#Transformer"><strong>Transformer</strong></a> architecture, and therefore relies on <a href="#self-attention"><strong>self-attention</strong></a>.</li> <li>Uses the <a href="#encoder"><strong>encoder</strong></a> part of the Transformer. The encoder&#39;s job is to produce good text representations, rather than to perform a specific task like classification.</li> <li>Is <a href="#bidirectional"><strong>bidirectional</strong></a>.</li> <li>Uses <a href="#masked-language-model"><strong>masking</strong></a> for <a href="#unsupervised_machine_learning"><strong>unsupervised training</strong></a>.</li> </ul> <p>BERT&#39;s variants include:</p> <ul> <li><a href="https://ai.googleblog.com/2019/12/albert-lite-bert-for-self-supervised.html">ALBERT</a>, which is an acronym for <strong>A</strong> <strong>L</strong>ight <strong>BERT</strong>.</li> <li><a href="https://ai.googleblog.com/2020/08/language-agnostic-bert-sentence.html">LaBSE</a>.</li> </ul> <p>See <a href="https://ai.googleblog.com/2018/11/open-sourcing-bert-state-of-art-pre.html">Open Sourcing BERT: State-of-the-Art Pre-training for Natural Language Processing</a> for an overview of BERT.</p> <p><a class="glossary-anchor" name="bias_ethics"></a> <h2 class="hide-from-toc" id="bias-ethicsfairness" data-text=" bias (ethics/fairness)" tabindex="-1"> bias (ethics/fairness)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p> 1. Stereotyping, prejudice, or favoritism towards some things, people, or groups over others. These biases can affect collection and interpretation of data, the design of a system, and how users interact with a system. Forms of this type of bias include: </p> <ul> <li><a href="#automation_bias"><strong>automation bias</strong></a></li> <li><a href="#confirmation_bias"><strong>confirmation bias</strong></a></li> <li><a href="#confirmation_bias"><strong>experimenter&#39;s bias</strong></a></li> <li><a href="#group_attribution_bias"><strong>group attribution bias</strong></a></li> <li><a href="#implicit_bias"><strong>implicit bias</strong></a></li> <li><a href="#in-group_bias"><strong>in-group bias</strong></a></li> <li><a href="#out-group_homogeneity_bias"><strong>out-group homogeneity bias</strong></a></li> </ul> <p> 2. Systematic error introduced by a sampling or reporting procedure. Forms of this type of bias include: </p> <ul> <li><a href="#selection_bias"><strong>coverage bias</strong></a></li> <li><a href="#selection_bias"><strong>non-response bias</strong></a></li> <li><a href="#participation_bias"><strong>participation bias</strong></a></li> <li><a href="#reporting_bias"><strong>reporting bias</strong></a></li> <li><a href="#selection_bias"><strong>sampling bias</strong></a></li> <li><a href="#selection_bias"><strong>selection bias</strong></a></li> </ul> <p>Not to be confused with the <a href="#bias"><strong>bias term</strong></a> in machine learning models or <a href="#prediction_bias"><strong>prediction bias</strong></a>.</p> <p>See <a href="/machine-learning/crash-course/fairness/types-of-bias">Fairness: Types of bias</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="bias"></a> <h2 class="hide-from-toc" id="bias-math-or-bias-term" data-text=" bias (math) or bias term" tabindex="-1"> bias (math) or bias term</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>An intercept or offset from an origin.
Bias is a parameter in machine learning models, which is symbolized by either of the following:</p> <ul> <li><i>b</i></li> <li><i>w<sub>0</sub></i></li> </ul> <p>For example, bias is the <em>b</em> in the following formula:</p> <div> $$y' = b + w_1x_1 + w_2x_2 + \ldots + w_nx_n$$ </div> <p>For a simple two-dimensional line, bias just means the &quot;y-intercept.&quot; For example, the bias of the line in the following illustration is 2.</p> <p> <img src="/static/machine-learning/glossary/images/bias.png" loading="lazy" alt="The plot of a line with a slope of 0.5 and a bias (y-intercept) of 2." > </p> <p>Bias exists because not all models start from the origin (0,0). For example, suppose an amusement park costs 2 Euros to enter and an additional 0.5 Euro for every hour a customer stays. Therefore, a model mapping the total cost has a bias of 2 because the lowest cost is 2 Euros.</p> <p>Bias is not to be confused with <a href="#bias_ethics"><strong>bias in ethics and fairness</strong></a> or <a href="#prediction_bias"><strong>prediction bias</strong></a>.</p> <p>See <a href="/machine-learning/crash-course/linear-regression">Linear Regression</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="bidirectional"></a> <h2 class="hide-from-toc" id="bidirectional" data-text=" bidirectional" tabindex="-1"> bidirectional</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A term used to describe a system that evaluates the text that both <em>precedes</em> and <em>follows</em> a target section of text. In contrast, a <a href="#unidirectional"><strong>unidirectional</strong></a> system only evaluates the text that <em>precedes</em> a target section of text.</p> <p>For example, consider a <a href="#masked-language-model"><strong>masked language model</strong></a> that must determine probabilities for the word or words representing the underline in the following question:</p> <blockquote> <p>What is the _____ with you?</p> </blockquote> <p>A unidirectional language model would have to base its probabilities only on the context provided by the words &quot;What&quot;, &quot;is&quot;, and &quot;the&quot;.
In contrast, a bidirectional language model could also gain context from &quot;with&quot; and &quot;you&quot;, which might help the model generate better predictions.</p> <p><a class="glossary-anchor" name="bidirectional-language-model"></a> <h2 class="hide-from-toc" id="bidirectional-language-model" data-text=" bidirectional language model" tabindex="-1"> bidirectional language model</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A <a href="#language-model"><strong>language model</strong></a> that determines the probability that a given token is present at a given location in an excerpt of text based on the <em>preceding</em> and <em>following</em> text.</p> <p><a class="glossary-anchor" name="bigram"></a> <h2 class="hide-from-toc" id="bigram" data-text=" bigram" tabindex="-1"> bigram</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Sequence Models">#seq</div> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>An <a href="#N-gram"><strong>N-gram</strong></a> in which N=2.</p> <p><a class="glossary-anchor" name="binary_classification"></a> <a class="glossary-anchor" name="binary-classification"></a> <h2 class="hide-from-toc" id="binary-classification" data-text=" binary classification" tabindex="-1"> binary classification</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A type of <a href="#classification_model"><strong>classification</strong></a> task that predicts one of two mutually exclusive classes:</p> <ul> <li>the <a href="#positive_class"><strong>positive class</strong></a></li> <li>the <a href="#negative_class"><strong>negative class</strong></a></li> </ul> <p>For example, the following two machine learning models each perform binary classification:</p> <ul> <li>A model that determines whether email messages are <em>spam</em> (the positive class) or <em>not spam</em> (the negative class).</li> <li>A model that evaluates medical symptoms to determine whether a person has a particular disease (the positive class) or doesn&#39;t have that disease (the negative class).</li> </ul> <p>Contrast with <a href="#multi-class"><strong>multi-class classification</strong></a>.</p> <p>See also <a href="#logistic_regression"><strong>logistic regression</strong></a> and <a href="#classification_threshold"><strong>classification threshold</strong></a>.</p> <p>See <a href="/machine-learning/crash-course/classification">Classification</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="binary-condition"></a> <h2 class="hide-from-toc" id="binary-condition" data-text=" binary condition " tabindex="-1"> binary condition </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>In a <a href="#decision-tree"><strong>decision tree</strong></a>, a <a href="#condition"><strong>condition</strong></a> that has only two possible outcomes, typically <em>yes</em> or <em>no</em>. 
For example, the following is a binary condition:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Text only">temperature >= 100</pre></devsite-code> <p>Contrast with <a href="#non-binary-condition"><strong>non-binary condition</strong></a>.</p> <p>See <a href="/machine-learning/decision-forests/conditions">Types of conditions</a> in the Decision Forests course for more information.</p> <p><a class="glossary-anchor" name="binning"></a> <h2 class="hide-from-toc" id="binning" data-text=" binning" tabindex="-1"> binning</h2></p> <p>Synonym for <a href="#bucketing"><strong>bucketing</strong></a>.</p> <p><a class="glossary-anchor" name="BLEU"></a> <h2 class="hide-from-toc" id="bleu-bilingual-evaluation-understudy" data-text=" BLEU (Bilingual Evaluation Understudy)" tabindex="-1"> BLEU (Bilingual Evaluation Understudy)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A score between 0.0 and 1.0, inclusive, indicating the quality of a translation between two human languages (for example, between English and Russian). A BLEU score of 1.0 indicates a perfect translation; a BLEU score of 0.0 indicates a terrible translation.</p> <p><a class="glossary-anchor" name="boosting"></a> <h2 class="hide-from-toc" id="boosting" data-text=" boosting" tabindex="-1"> boosting</h2></p> <p>A machine learning technique that iteratively combines a set of simple and not very accurate classifiers (referred to as &quot;weak&quot; classifiers) into a classifier with high accuracy (a &quot;strong&quot; classifier) by <a href="#upweighting"><strong>upweighting</strong></a> the examples that the model is currently misclassifying.</p> <p>See <a href="/machine-learning/decision-forests/intro-to-gbdt">Gradient Boosted Decision Trees?</a> in the Decision Forests course for more information.</p> <p><a class="glossary-anchor" name="bounding_box"></a> <h2 class="hide-from-toc" id="bounding-box" data-text=" bounding box" tabindex="-1"> bounding box</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>In an image, the (<em>x</em>, <em>y</em>) coordinates of a rectangle around an area of interest, such as the dog in the image below.</p> <p> <img src="/static/machine-learning/glossary/images/bounding_box.jpg" loading="lazy" width="280" alt="Photograph of a dog sitting on a sofa. A green bounding box with top-left coordinates of (275, 1271) and bottom-right coordinates of (2954, 2761) circumscribes the dog's body" > </p> <p><a class="glossary-anchor" name="broadcasting"></a> <h2 class="hide-from-toc" id="broadcasting" data-text=" broadcasting" tabindex="-1"> broadcasting</h2></p> <p>Expanding the shape of an operand in a matrix math operation to <a href="#dimensions"><strong>dimensions</strong></a> compatible for that operation. For example, linear algebra requires that the two operands in a matrix addition operation must have the same dimensions. Consequently, you can&#39;t add a matrix of shape (m, n) to a vector of length n. 
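</p> <p>In NumPy, for instance, this addition nonetheless succeeds because NumPy broadcasts automatically. A quick check, assuming NumPy is installed:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Python"><code translate="no" dir="ltr">import numpy as np

A = np.array([[7, 10, 4],
              [13, 5, 9]])   # shape (2, 3)
b = np.array([2, 2, 2])      # shape (3,)

print(A + b)   # b is broadcast across both rows of A:
# [[ 9 12  6]
#  [15  7 11]]
</code></pre></devsite-code> <p>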
Broadcasting enables this operation by virtually expanding the vector of length n to a matrix of shape (m, n) by replicating the same values down each column.</p> <p>For example, given the following definitions, linear algebra prohibits A+B because A and B have different dimensions:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Text only"><code translate="no" dir="ltr">A = [[7, 10, 4], [13, 5, 9]] B = [2] </code></pre></devsite-code> <p>However, broadcasting enables the operation A+B by virtually expanding B to:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Text only"><code translate="no" dir="ltr"> [[2, 2, 2], [2, 2, 2]] </code></pre></devsite-code> <p>Thus, A+B is now a valid operation:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Text only"><code translate="no" dir="ltr">[[7, 10, 4], + [[2, 2, 2], = [[ 9, 12, 6], [13, 5, 9]] [2, 2, 2]] [15, 7, 11]] </code></pre></devsite-code> <p>See the following description of <a href="https://docs.scipy.org/doc/numpy-1.15.0/user/basics.broadcasting.html" target="T">broadcasting in NumPy</a> for more details.</p> <p><a class="glossary-anchor" name="bucketing"></a> <h2 class="hide-from-toc" id="bucketing" data-text=" bucketing" tabindex="-1"> bucketing</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Converting a single <a href="#feature"><strong>feature</strong></a> into multiple binary features called <strong>buckets</strong> or <strong>bins</strong>, typically based on a value range. The chopped feature is typically a <a href="#continuous_feature"><strong>continuous feature</strong></a>.</p> <p>For example, instead of representing temperature as a single continuous floating-point feature, you could chop ranges of temperatures into discrete buckets, such as:</p> <ul> <li>&lt;= 10 degrees Celsius would be the &quot;cold&quot; bucket.</li> <li>11 - 24 degrees Celsius would be the &quot;temperate&quot; bucket.</li> <li>&gt;= 25 degrees Celsius would be the &quot;warm&quot; bucket.</li> </ul> <p>The model will treat every value in the same bucket identically. For example, the values <code translate="no" dir="ltr">13</code> and <code translate="no" dir="ltr">22</code> are both in the temperate bucket, so the model treats the two values identically.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-for-additional-notes._1" data-text=" Click the icon for additional notes. " tabindex="-1"> Click the icon for additional notes. </h4> <div class="expand-background"> <p> If you represent temperature as a continuous feature, then the model treats temperature as a single feature. If you represent temperature as three buckets, then the model treats each bucket as a separate feature. That is, a model can learn separate relationships of each bucket to the <a href="#label"><b>label</b></a>. For example, a <a href="#linear_regression"><b>linear regression</b></a> model can learn separate <a href="#weight"><b>weights</b></a> for each bucket. </p> <p>Increasing the number of buckets makes your model more complicated by increasing the number of relationships that your model must learn. For example, the cold, temperate, and warm buckets are essentially three separate features for your model to train on. 
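</p> <p>Concretely, a minimal sketch of turning a temperature into those three bucket features, using the ranges listed earlier:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Python"><code translate="no" dir="ltr">def bucketize_temperature(celsius):
  """Maps a temperature to three binary bucket features: cold, temperate, warm."""
  cold = 1 if celsius &lt;= 10 else 0
  temperate = 1 if 11 &lt;= celsius &lt;= 24 else 0
  warm = 1 if celsius >= 25 else 0
  return [cold, temperate, warm]

print(bucketize_temperature(13))  # [0, 1, 0]
print(bucketize_temperature(22))  # [0, 1, 0]  (same bucket, so treated identically)
</code></pre></devsite-code> <p>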
If you decide to add two more buckets--for example, freezing and hot--your model would now have to train on five separate features.</p> <p>How do you know how many buckets to create, or what the ranges for each bucket should be? The answers typically require a fair amount of experimentation. </p> </div> <hr /> </section> <p>See <a href="/machine-learning/crash-course/numerical-data/binning">Numerical data: Binning</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="c"></a> <h2 class="glossary" id="c" data-text="C" tabindex="-1">C</h2></p> <p><a class="glossary-anchor" name="calibration_layer"></a> <h2 class="hide-from-toc" id="calibration-layer" data-text=" calibration layer" tabindex="-1"> calibration layer</h2></p> <p>A post-prediction adjustment, typically to account for <a href="#prediction_bias"><strong>prediction bias</strong></a>. The adjusted predictions and probabilities should match the distribution of an observed set of labels.</p> <p><a class="glossary-anchor" name="candidate_generation"></a> <h2 class="hide-from-toc" id="candidate-generation" data-text=" candidate generation" tabindex="-1"> candidate generation</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Recommendation Systems">#recsystems</div> </div></p> <p>The initial set of recommendations chosen by a <a href="#recommendation_system"><strong>recommendation system</strong></a>. For example, consider a bookstore that offers 100,000 titles. The candidate generation phase creates a much smaller list of suitable books for a particular user, say 500. But even 500 books is way too many to recommend to a user. Subsequent, more expensive, phases of a recommendation system (such as <a href="#scoring"><strong>scoring</strong></a> and <a href="#re-ranking"><strong>re-ranking</strong></a>) reduce those 500 to a much smaller, more useful set of recommendations.</p> <p>See <a href="/machine-learning/recommendation/overview/candidate-generation">Candidate generation overview</a> in the Recommendation Systems course for more information.</p> <p><a class="glossary-anchor" name="candidate_sampling"></a> <h2 class="hide-from-toc" id="candidate-sampling" data-text=" candidate sampling" tabindex="-1"> candidate sampling</h2></p> <p>A training-time optimization that calculates a probability for all the <a href="#positive_class"><strong>positive</strong></a> labels, using, for example, <a href="#softmax"><strong>softmax</strong></a>, but only for a random sample of negative labels. 
For instance, given an example labeled <em>beagle</em> and <em>dog</em>, candidate sampling computes the predicted probabilities and corresponding loss terms for:</p> <ul> <li><em>beagle</em></li> <li><em>dog</em></li> <li>a random subset of the remaining negative classes (for example, <em>cat</em>, <em>lollipop</em>, <em>fence</em>).</li> </ul> <p>The idea is that the <a href="#negative_class"><strong>negative classes</strong></a> can learn from less frequent negative reinforcement as long as <a href="#positive_class"><strong>positive classes</strong></a> always get proper positive reinforcement, and this is indeed observed empirically.</p> <p>Candidate sampling is more computationally efficient than training algorithms that compute predictions for <em>all</em> negative classes, particularly when the number of negative classes is very large.</p> <p><a class="glossary-anchor" name="categorical_data"></a> <h2 class="hide-from-toc" id="categorical-data" data-text=" categorical data" tabindex="-1"> categorical data</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p><a href="#feature"><strong>Features</strong></a> having a specific set of possible values. For example, consider a categorical feature named <code translate="no" dir="ltr">traffic-light-state</code>, which can only have one of the following three possible values:</p> <ul> <li><code translate="no" dir="ltr">red</code></li> <li><code translate="no" dir="ltr">yellow</code></li> <li><code translate="no" dir="ltr">green</code></li> </ul> <p>By representing <code translate="no" dir="ltr">traffic-light-state</code> as a categorical feature, a model can learn the differing impacts of <code translate="no" dir="ltr">red</code>, <code translate="no" dir="ltr">green</code>, and <code translate="no" dir="ltr">yellow</code> on driver behavior.</p> <p>Categorical features are sometimes called <a href="#discrete_feature"><strong>discrete features</strong></a>.</p> <p>Contrast with <a href="#numerical_data"><strong>numerical data</strong></a>.</p> <p>See <a href="/machine-learning/crash-course/categorical-data">Working with categorical data</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="causal-language-model"></a> <h2 class="hide-from-toc" id="causal-language-model" data-text=" causal language model" tabindex="-1"> causal language model</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>Synonym for <a href="#unidirectional-language-model"><strong>unidirectional language model</strong></a>.</p> <p>See <a href="#bidirectional-language-model"><strong>bidirectional language model</strong></a> to contrast different directional approaches in language modeling.</p> <p><a class="glossary-anchor" name="centroid"></a> <h2 class="hide-from-toc" id="centroid" data-text="centroid" tabindex="-1">centroid</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Clustering">#clustering</div> </div></p> <p>The center of a cluster as determined by a <a href="#k-means"><strong>k-means</strong></a> or <a href="#k-median"><strong>k-median</strong></a> algorithm. 
For example, if k is 3, then the k-means or k-median algorithm finds 3 centroids.</p> <p>See <a href="/machine-learning/clustering/clustering-algorithms">Clustering algorithms</a> in the Clustering course for more information.</p> <p><a class="glossary-anchor" name="centroid_based_clustering"></a> <h2 class="hide-from-toc" id="centroid-based-clustering" data-text=" centroid-based clustering" tabindex="-1"> centroid-based clustering</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Clustering">#clustering</div> </div></p> <p>A category of <a href="#clustering"><strong>clustering</strong></a> algorithms that organizes data into nonhierarchical clusters. <a href="#k-means"><strong>k-means</strong></a> is the most widely used centroid-based clustering algorithm.</p> <p>Contrast with <a href="#hierarchical_clustering"><strong>hierarchical clustering</strong></a> algorithms.</p> <p>See <a href="/machine-learning/clustering/clustering-algorithms">Clustering algorithms</a> in the Clustering course for more information.</p> <p><a class="glossary-anchor" name="chain-of-thought-prompting"></a> <h2 class="hide-from-toc" id="chain-of-thought-prompting" data-text=" chain-of-thought prompting" tabindex="-1"> chain-of-thought prompting</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>A <a href="#prompt-engineering"><strong>prompt engineering</strong></a> technique that encourages a <a href="#large-language-model"><strong>large language model</strong></a> (LLM) to explain its reasoning, step by step. For example, consider the following prompt, paying particular attention to the second sentence:</p> <blockquote> <p>How many g forces would a driver experience in a car that goes from 0 to 60 miles per hour in 7 seconds? In the answer, show all relevant calculations.</p> </blockquote> <p>The LLM&#39;s response would likely:</p> <ul> <li>Show a sequence of physics formulas, plugging in the values 0, 60, and 7 in appropriate places.</li> <li>Explain why it chose those formulas and what the various variables mean.</li> </ul> <p>Chain-of-thought prompting forces the LLM to perform all the calculations, which might lead to a more correct answer. In addition, chain-of-thought prompting enables the user to examine the LLM&#39;s steps to determine whether or not the answer makes sense.</p> <p><a class="glossary-anchor" name="chat"></a> <h2 class="hide-from-toc" id="chat" data-text=" chat" tabindex="-1"> chat</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>The contents of a back-and-forth dialogue with an ML system, typically a <a href="#large-language-model"><strong>large language model</strong></a>. The previous interaction in a chat (what you typed and how the large language model responded) becomes the context for subsequent parts of the chat.</p> <p>A <strong>chatbot</strong> is an application of a large language model.</p> <p><a class="glossary-anchor" name="checkpoint"></a> <h2 class="hide-from-toc" id="checkpoint" data-text=" checkpoint" tabindex="-1"> checkpoint</h2></p> <p>Data that captures the state of a model&#39;s <a href="#parameter"><strong>parameters</strong></a> either during training or after training is completed. 
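</p> <p>With a framework like Keras, for example, capturing and later restoring a checkpoint might look like the following sketch (assuming TensorFlow is installed; the model and file name are just placeholders):</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Python"><code translate="no" dir="ltr">from tensorflow import keras

# A tiny model, just so there are some parameters to save.
model = keras.Sequential([keras.layers.Dense(units=1)])
model.build(input_shape=(None, 3))

# Capture a checkpoint of the current parameter values...
model.save_weights("my_checkpoint.weights.h5")

# ...and later restore them, possibly in a different process or on
# different hardware, before resuming training.
model.load_weights("my_checkpoint.weights.h5")
</code></pre></devsite-code> <p>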
For example, during training, you can:</p> <ol> <li>Stop training, perhaps intentionally or perhaps as the result of certain errors.</li> <li>Capture the checkpoint.</li> <li>Later, reload the checkpoint, possibly on different hardware.</li> <li>Restart training.</li> </ol> <p><a class="glossary-anchor" name="class"></a> <h2 class="hide-from-toc" id="class" data-text=" class" tabindex="-1"> class</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A category that a <a href="#label"><strong>label</strong></a> can belong to. For example:</p> <ul> <li>In a <a href="#binary_classification"><strong>binary classification</strong></a> model that detects spam, the two classes might be <em>spam</em> and <em>not spam</em>.</li> <li>In a <a href="#multi-class"><strong>multi-class classification</strong></a> model that identifies dog breeds, the classes might be <em>poodle</em>, <em>beagle</em>, <em>pug</em>, and so on.</li> </ul> <p>A <a href="#classification_model"><strong>classification model</strong></a> predicts a class. In contrast, a <a href="#regression_model"><strong>regression model</strong></a> predicts a number rather than a class.</p> <p>See <a href="/machine-learning/crash-course/classification">Classification</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="classification_model"></a> <h2 class="hide-from-toc" id="classification-model" data-text=" classification model" tabindex="-1"> classification model</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A <a href="#model"><strong>model</strong></a> whose prediction is a <a href="#class"><strong>class</strong></a>. For example, the following are all classification models:</p> <ul> <li>A model that predicts an input sentence&#39;s language (French? Spanish? Italian?).</li> <li>A model that predicts tree species (Maple? Oak? Baobab?).</li> <li>A model that predicts the positive or negative class for a particular medical condition.</li> </ul> <p>In contrast, <a href="#regression_model"><strong>regression models</strong></a> predict numbers rather than classes.</p> <p>Two common types of classification models are:</p> <ul> <li><a href="#binary-classification"><strong>binary classification</strong></a></li> <li><a href="#multi-class"><strong>multi-class classification</strong></a></li> </ul> <p><a class="glossary-anchor" name="classification_threshold"></a> <h2 class="hide-from-toc" id="classification-threshold" data-text=" classification threshold" tabindex="-1"> classification threshold</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>In a <a href="#binary-classification"><strong>binary classification</strong></a>, a number between 0 and 1 that converts the raw output of a <a href="#logistic_regression"><strong>logistic regression</strong></a> model into a prediction of either the <a href="#positive_class"><strong>positive class</strong></a> or the <a href="#negative_class"><strong>negative class</strong></a>. Note that the classification threshold is a value that a human chooses, not a value chosen by model training.</p> <p>A logistic regression model outputs a raw value between 0 and 1. 
Then:</p> <ul> <li>If this raw value is <em>greater than</em> the classification threshold, then the positive class is predicted.</li> <li>If this raw value is <em>less than</em> the classification threshold, then the negative class is predicted.</li> </ul> <p>For example, suppose the classification threshold is 0.8. If the raw value is 0.9, then the model predicts the positive class. If the raw value is 0.7, then the model predicts the negative class.</p> <p>The choice of classification threshold strongly influences the number of <a href="#FP"><strong>false positives</strong></a> and <a href="#FN"><strong>false negatives</strong></a>.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-for-additional-notes._2" data-text=" Click the icon for additional notes. " tabindex="-1"> Click the icon for additional notes. </h4> <div class="expand-background"> <p> As models or datasets evolve, engineers sometimes also change the classification threshold. When the classification threshold changes, positive class predictions can suddenly become negative classes and vice-versa. </p> <p> For example, consider a binary classification disease prediction model. Suppose that when the system runs in the first year:</p> <ul> <li>The raw value for a particular patient is 0.95.</li> <li>The classification threshold is 0.94.</li> </ul> <p>Therefore, the system diagnoses the positive class. (The patient gasps, "Oh no! I'm sick!")</p> <p>A year later, perhaps the values now look as follows:</p> <ul> <li>The raw value for the same patient remains at 0.95.</li> <li>The classification threshold changes to 0.97.</li> </ul> <p>Therefore, the system now reclassifies that patient as the negative class. ("Happy day! I'm not sick.") Same patient. Different diagnosis.</p> </div> <hr /> </section> <p>See <a href="/machine-learning/crash-course/classification/thresholding">Thresholds and the confusion matrix</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="class_imbalanced_data_set"></a> <h2 class="hide-from-toc" id="class-imbalanced-dataset" data-text=" class-imbalanced dataset" tabindex="-1"> class-imbalanced dataset</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A dataset for a classification problem in which the total number of <a href="#label"><strong>labels</strong></a> of each class differs significantly. For example, consider a binary classification dataset whose two labels are divided as follows:</p> <ul> <li>1,000,000 negative labels</li> <li>10 positive labels</li> </ul> <p>The ratio of negative to positive labels is 100,000 to 1, so this is a class-imbalanced dataset.</p> <p>In contrast, the following dataset is <em>not</em> class-imbalanced because the ratio of negative labels to positive labels is relatively close to 1:</p> <ul> <li>517 negative labels</li> <li>483 positive labels</li> </ul> <p>Multi-class datasets can also be class-imbalanced. 
For example, the following multi-class classification dataset is also class-imbalanced because one label has far more examples than the other two:</p> <ul> <li>1,000,000 labels with class &quot;green&quot;</li> <li>200 labels with class &quot;purple&quot;</li> <li>350 labels with class &quot;orange&quot;</li> </ul> <p>See also <a href="#entropy"><strong>entropy</strong></a>, <a href="#majority_class"><strong>majority class</strong></a>, and <a href="#minority_class"><strong>minority class</strong></a>.</p> <p><a class="glossary-anchor" name="clipping"></a> <h2 class="hide-from-toc" id="clipping" data-text=" clipping" tabindex="-1"> clipping</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A technique for handling <a href="#outliers"><strong>outliers</strong></a> by doing either or both of the following:</p> <ul> <li>Reducing <a href="#feature"><strong>feature</strong></a> values that are greater than a maximum threshold down to that maximum threshold.</li> <li>Increasing feature values that are less than a minimum threshold up to that minimum threshold.</li> </ul> <p>For example, suppose that &lt;0.5% of values for a particular feature fall outside the range 40–60. In this case, you could do the following:</p> <ul> <li>Clip all values over 60 (the maximum threshold) to be exactly 60.</li> <li>Clip all values under 40 (the minimum threshold) to be exactly 40.</li> </ul> <p>Outliers can damage models, sometimes causing <a href="#weight"><strong>weights</strong></a> to overflow during training. Some outliers can also dramatically spoil metrics like <a href="#accuracy"><strong>accuracy</strong></a>. Clipping is a common technique to limit the damage.</p> <p><a href="#gradient_clipping"><strong>Gradient clipping</strong></a> forces <a href="#gradient"><strong>gradient</strong></a> values within a designated range during training.</p> <p>See <a href="/machine-learning/crash-course/numerical-data/normalization">Numerical data: Normalization</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="Cloud_TPU"></a> <h2 class="hide-from-toc" id="cloud-tpu" data-text=" Cloud TPU " tabindex="-1"> Cloud TPU </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> <div class="glossary-icon" title="Google Cloud">#GoogleCloud</div> </div></p> <p>A specialized hardware accelerator designed to speed up machine learning workloads on Google Cloud.</p> <p><a class="glossary-anchor" name="clustering"></a> <h2 class="hide-from-toc" id="clustering" data-text="clustering" tabindex="-1">clustering</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Clustering">#clustering</div> </div></p> <p>Grouping related <a href="#example"><strong>examples</strong></a>, particularly during <a href="#unsupervised_machine_learning"><strong>unsupervised learning</strong></a>. Once all the examples are grouped, a human can optionally supply meaning to each cluster.</p> <p>Many clustering algorithms exist. For example, the <a href="#k-means"><strong>k-means</strong></a> algorithm clusters examples based on their proximity to a <a href="#centroid"><strong>centroid</strong></a>, as in the following diagram:</p> <p> <img src="/static/machine-learning/glossary/images/Cluster.svg" loading="lazy" alt="A two-dimensional graph in which the x-axis is labeled tree width, and the y-axis is labeled tree height. 
The graph contains two centroids and several dozen data points. The data points are categorized based on their proximity. That is, the data points closest to one centroid are categorized as cluster 1, while those closest to the other centroid are categorized as cluster 2." > </p> <p>A human researcher could then review the clusters and, for example, label cluster 1 as &quot;dwarf trees&quot; and cluster 2 as &quot;full-size trees.&quot;</p> <p>As another example, consider a clustering algorithm based on an example&#39;s distance from a center point, illustrated as follows:</p> <p> <img src="/static/machine-learning/glossary/images/RingCluster.svg" loading="lazy" alt="Dozens of data points are arranged in concentric circles, almost like holes around the center of a dart board. The innermost ring of data points is categorized as cluster 1, the middle ring is categorized as cluster 2, and the outermost ring as cluster 3." > </p> <p>See the <a href="/machine-learning/clustering">Clustering course</a> for more information.</p> <p><a class="glossary-anchor" name="co-adaptation"></a> <h2 class="hide-from-toc" id="co-adaptation" data-text=" co-adaptation" tabindex="-1"> co-adaptation</h2></p> <p>When <a href="#neuron"><strong>neurons</strong></a> predict patterns in training data by relying almost exclusively on outputs of specific other neurons instead of relying on the network&#39;s behavior as a whole. When the patterns that cause co-adaptation are not present in validation data, then co-adaptation causes overfitting. <a href="#dropout_regularization"><strong>Dropout regularization</strong></a> reduces co-adaptation because dropout ensures neurons cannot rely solely on specific other neurons.</p> <p><a class="glossary-anchor" name="collaborative_filtering"></a> <h2 class="hide-from-toc" id="collaborative-filtering" data-text=" collaborative filtering" tabindex="-1"> collaborative filtering</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Recommendation Systems">#recsystems</div> </div></p> <p>Making <a href="#prediction"><strong>predictions</strong></a> about the interests of one user based on the interests of many other users. Collaborative filtering is often used in <a href="#recommendation_system"><strong>recommendation systems</strong></a>.</p> <p>See <a href="/machine-learning/recommendation/collaborative/basics">Collaborative filtering</a> in the Recommendation Systems course for more information.</p> <p><a class="glossary-anchor" name="concept-drift"></a> <h2 class="hide-from-toc" id="concept-drift" data-text=" concept drift" tabindex="-1"> concept drift</h2></p> <p>A shift in the relationship between features and the label. Over time, concept drift reduces a model&#39;s quality.</p> <p>During training, the model learns the relationship between the features and their labels in the training set. If the labels in the training set are good proxies for the real-world, then the model <em>should</em> make good real world predictions. 
However, due to concept drift, the model&#39;s predictions tend to degrade over time.</p> <p>For example, consider a <a href="#binary-classification"><strong>binary classification</strong></a> model that predicts whether or not a certain car model is &quot;fuel efficient.&quot; That is, the features could be:</p> <ul> <li>car weight</li> <li>engine compression</li> <li>transmission type</li> </ul> <p>while the label is either:</p> <ul> <li>fuel efficient</li> <li>not fuel efficient</li> </ul> <p>However, the concept of &quot;fuel efficient car&quot; keeps changing. A car model labeled <em>fuel efficient</em> in 1994 would almost certainly be labeled <em>not fuel efficient</em> in 2024. A model suffering from concept drift tends to make less and less useful predictions over time.</p> <p>Compare and contrast with <a href="#nonstationarity"><strong>nonstationarity</strong></a>.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-for-additional-notes._3" data-text=" Click the icon for additional notes. " tabindex="-1"> Click the icon for additional notes. </h4> <div class="expand-background"> <p> To compensate for concept drift, retrain models faster than the <i>rate</i> of concept drift. For example, if concept drift reduces model precision by a meaningful margin every two months, then retrain your model more frequently than every two months. </p> </div> <hr /> </section> <p><a class="glossary-anchor" name="condition"></a> <h2 class="hide-from-toc" id="condition" data-text=" condition " tabindex="-1"> condition </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>In a <a href="#decision-tree"><strong>decision tree</strong></a>, any <a href="#node"><strong>node</strong></a> that evaluates an expression. For example, the following portion of a decision tree contains two conditions:</p> <p> <img src="/static/machine-learning/glossary/images/condition.png" loading="lazy" width="433" alt="A decision tree consisting of two conditions: (x > 0) and (y > 0)." > </p> <p>A condition is also called a split or a test.</p> <p>Contrast condition with <a href="#leaf"><strong>leaf</strong></a>.</p> <p>See also:</p> <ul> <li><a href="#binary-condition"><strong>binary condition</strong></a></li> <li><a href="#non-binary-condition"><strong>non-binary condition</strong></a>.</li> <li><a href="#axis-aligned-condition"><strong>axis-aligned-condition</strong></a></li> <li><a href="#oblique-condition"><strong>oblique-condition</strong></a></li> </ul> <p>See <a href="/machine-learning/decision-forests/conditions">Types of conditions</a> in the Decision Forests course for more information.</p> <p><a class="glossary-anchor" name="confabulation"></a> <h2 class="hide-from-toc" id="confabulation" data-text=" confabulation" tabindex="-1"> confabulation</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>Synonym for <a href="#hallucination"><strong>hallucination</strong></a>.</p> <p>Confabulation is probably a more technically accurate term than hallucination. 
However, hallucination became popular first.</p> <p><a class="glossary-anchor" name="configuration"></a> <h2 class="hide-from-toc" id="configuration" data-text=" configuration" tabindex="-1"> configuration</h2></p> <p>The process of assigning the initial property values used to train a model, including:</p> <ul> <li>the model&#39;s composing <a href="#layer"><strong>layers</strong></a></li> <li>the location of the data</li> <li><a href="#hyperparameter"><strong>hyperparameters</strong></a> such as: <ul> <li><a href="#learning_rate"><strong>learning rate</strong></a></li> <li><a href="#iteration"><strong>iterations</strong></a></li> <li><a href="#optimizer"><strong>optimizer</strong></a></li> <li><a href="#loss-function"><strong>loss function</strong></a></li> </ul></li> </ul> <p>In machine learning projects, configuration can be done through a special configuration file or using configuration libraries such as the following:</p> <ul> <li><a href="https://www.tensorflow.org/tensorboard/hyperparameter_tuning_with_hparams" target="T">HParam</a></li> <li><a href="https://github.com/google/gin-config" target="T">Gin</a></li> <li><a href="#fiddle"><b>Fiddle</b></a></li> </ul> <p><a class="glossary-anchor" name="confirmation_bias"></a> <h2 class="hide-from-toc" id="confirmation-bias" data-text=" confirmation bias " tabindex="-1"> confirmation bias </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>The tendency to search for, interpret, favor, and recall information in a way that confirms one&#39;s pre-existing beliefs or hypotheses. Machine learning developers may inadvertently collect or label data in ways that influence an outcome supporting their existing beliefs. Confirmation bias is a form of <a href="#implicit_bias"><strong>implicit bias</strong></a>.</p> <p><strong>Experimenter&#39;s bias</strong> is a form of confirmation bias in which an experimenter continues training models until a pre-existing hypothesis is confirmed.</p> <p><a class="glossary-anchor" name="confusion_matrix"></a> <h2 class="hide-from-toc" id="confusion-matrix" data-text=" confusion matrix" tabindex="-1"> confusion matrix</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>An NxN table that summarizes the number of correct and incorrect predictions that a <a href="#classification_model"><strong>classification model</strong></a> made. For example, consider the following confusion matrix for a <a href="#binary_classification"><strong>binary classification</strong></a> model:</p> <table> <thead> <tr> <th></th> <th>Tumor (predicted)</th> <th>Non-Tumor (predicted)</th> </tr> </thead> <tbody> <tr> <td>Tumor (ground truth)</td> <td>18 (TP)</td> <td>1 (FN)</td> </tr> <tr> <td>Non-Tumor (ground truth)</td> <td>6 (FP)</td> <td>452 (TN)</td> </tr> </tbody> </table> <p>The preceding confusion matrix shows the following:</p> <ul> <li>Of the 19 predictions in which <a href="#ground_truth"><strong>ground truth</strong></a> was Tumor, the model correctly classified 18 and incorrectly classified 1.</li> <li>Of the 458 predictions in which ground truth was Non-Tumor, the model correctly classified 452 and incorrectly classified 6.</li> </ul> <p>The confusion matrix for a <a href="#multi-class"><strong>multi-class classification</strong></a> problem can help you identify patterns of mistakes. 
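</p> <p>As a concrete illustration, the following minimal sketch computes <a href="#precision"><strong>precision</strong></a>, <a href="#recall"><strong>recall</strong></a>, and <a href="#accuracy"><strong>accuracy</strong></a> from the counts in the binary tumor matrix above:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Python"># Counts from the binary tumor confusion matrix above.
tp, fn = 18, 1    # row: Tumor (ground truth)
fp, tn = 6, 452   # row: Non-Tumor (ground truth)

precision = tp / (tp + fp)                  # 18 / 24 = 0.75
recall = tp / (tp + fn)                     # 18 / 19 ≈ 0.947
accuracy = (tp + tn) / (tp + fn + fp + tn)  # 470 / 477 ≈ 0.985

print(f"precision={precision:.3f} recall={recall:.3f} accuracy={accuracy:.3f}")
</pre></devsite-code> <p>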
For example, consider the following confusion matrix for a 3-class multi-class classification model that categorizes three different iris types (Virginica, Versicolor, and Setosa). When the ground truth was Virginica, the confusion matrix shows that the model was far more likely to mistakenly predict Versicolor than Setosa:</p> <table> <tr> <th>&nbsp;</th> <th>Setosa (predicted)</th> <th>Versicolor (predicted)</th> <th>Virginica (predicted)</th> </tr> <tr> <td>Setosa (ground truth)</td> <td>88</td> <td>12</td> <td>0</td> </tr> <tr> <td>Versicolor (ground truth)</td> <td>6</td> <td>141</td> <td>7</td> </tr> <tr> <td>Virginica (ground truth)</td> <td>2</td> <td>27</td> <td>109</td> </tr> </table> <p>As yet another example, a confusion matrix could reveal that a model trained to recognize handwritten digits tends to mistakenly predict 9 instead of 4, or mistakenly predict 1 instead of 7.</p> <p>Confusion matrixes contain sufficient information to calculate a variety of performance metrics, including <a href="#precision"><strong>precision</strong></a> and <a href="#recall"><strong>recall</strong></a>.</p> <p><a class="glossary-anchor" name="constituency-parsing"></a> <h2 class="hide-from-toc" id="constituency-parsing" data-text=" constituency parsing" tabindex="-1"> constituency parsing</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>Dividing a sentence into smaller grammatical structures (&quot;constituents&quot;). A later part of the ML system, such as a <a href="#natural_language_understanding"><strong>natural language understanding</strong></a> model, can parse the constituents more easily than the original sentence. For example, consider the following sentence:</p> <blockquote> <p>My friend adopted two cats.</p> </blockquote> <p>A constituency parser can divide this sentence into the following two constituents:</p> <ul> <li><em>My friend</em> is a noun phrase.</li> <li><em>adopted two cats</em> is a verb phrase.</li> </ul> <p>These constituents can be further subdivided into smaller constituents. For example, the verb phrase</p> <blockquote> <p>adopted two cats</p> </blockquote> <p>could be further subdivided into:</p> <ul> <li><em>adopted</em> is a verb.</li> <li><em>two cats</em> is another noun phrase.</li> </ul> <p><a class="glossary-anchor" name="contextualized_language_embedding"></a> <h2 class="hide-from-toc" id="contextualized-language-embedding" data-text=" contextualized language embedding" tabindex="-1"> contextualized language embedding</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>An <a href="#embedding_vector"><strong>embedding</strong></a> that comes close to &quot;understanding&quot; words and phrases in ways that native human speakers can. Contextualized language embeddings can understand complex syntax, semantics, and context.</p> <p>For example, consider embeddings of the English word <em>cow</em>. Older embeddings such as <a href="https://wikipedia.org/wiki/Word2vec">word2vec</a> can represent English words such that the distance in the <a href="#embedding_space"><strong>embedding space</strong></a> from <em>cow</em> to <em>bull</em> is similar to the distance from <em>ewe</em> (female sheep) to <em>ram</em> (male sheep) or from <em>female</em> to <em>male</em>. 
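</p> <p>With static embeddings, that geometric regularity can be checked directly. The following sketch assumes a hypothetical <code translate="no" dir="ltr">embedding</code> lookup table (for example, loaded from pretrained word2vec vectors) that maps each word to a NumPy array:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Python">import numpy as np

def direction(word_a, word_b, embedding):
    """Difference vector between two words' embeddings."""
    return embedding[word_a] - embedding[word_b]

def cosine_similarity(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

# With word2vec-style vectors, these two directions tend to point the same
# way, which is what the "cow is to bull as ewe is to ram" analogy means:
# cosine_similarity(direction("cow", "bull", embedding),
#                   direction("ewe", "ram", embedding))   # close to 1.0
</pre></devsite-code> <p>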
Contextualized language embeddings can go a step further by recognizing that English speakers sometimes casually use the word <em>cow</em> to mean either cow or bull.</p> <p><a class="glossary-anchor" name="context_window"></a> <h2 class="hide-from-toc" id="context-window" data-text=" context window" tabindex="-1"> context window</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>The number of <a href="#token"><strong>tokens</strong></a> a model can process in a given <a href="#prompt"><strong>prompt</strong></a>. The larger the context window, the more information the model can use to provide coherent and consistent responses to the prompt.</p> <p><a class="glossary-anchor" name="continuous_feature"></a> <h2 class="hide-from-toc" id="continuous-feature" data-text=" continuous feature" tabindex="-1"> continuous feature</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A floating-point <a href="#feature"><strong>feature</strong></a> with an infinite range of possible values, such as temperature or weight.</p> <p>Contrast with <a href="#discrete_feature"><strong>discrete feature</strong></a>.</p> <p><a class="glossary-anchor" name="convenience_sampling"></a> <h2 class="hide-from-toc" id="convenience-sampling" data-text=" convenience sampling" tabindex="-1"> convenience sampling</h2></p> <p>Using a dataset not gathered scientifically in order to run quick experiments. Later on, it&#39;s essential to switch to a scientifically gathered dataset.</p> <p><a class="glossary-anchor" name="convergence"></a> <h2 class="hide-from-toc" id="convergence" data-text=" convergence" tabindex="-1"> convergence</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A state reached when <a href="#loss"><strong>loss</strong></a> values change very little or not at all with each <a href="#iteration"><strong>iteration</strong></a>. For example, the following <a href="#loss_curve"><strong>loss curve</strong></a> suggests convergence at around 700 iterations:</p> <p> <img src="/static/machine-learning/glossary/images/Convergence.png" loading="lazy" alt="Cartesian plot. X-axis is the number of training iterations. Y-axis is loss. Loss is very high during the first few iterations, but drops sharply. After about 100 iterations, loss is still descending but far more gradually. After about 700 iterations, loss stays flat." > </p> <p>A model <strong>converges</strong> when additional training won&#39;t improve the model.</p> <p>In <a href="#deep_model"><strong>deep learning</strong></a>, loss values sometimes stay constant or nearly so for many iterations before finally descending. 
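</p> <p>A naive plateau test, such as the following sketch, treats a small recent change in loss as convergence; the patience and tolerance values here are arbitrary choices:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Python">def appears_converged(losses, patience=5, tolerance=1e-3):
    """Returns True if none of the last `patience` steps improved loss by more than `tolerance`."""
    if len(losses) &lt;= patience:
        return False
    recent = losses[-(patience + 1):]
    improvements = [earlier - later for earlier, later in zip(recent, recent[1:])]
    return all(improvement &lt; tolerance for improvement in improvements)

# A long flat stretch looks "converged," even though training might
# eventually push the loss down again.
print(appears_converged([2.0, 1.1, 0.70, 0.7001, 0.7000, 0.6999, 0.7000, 0.6999]))  # True
</pre></devsite-code> <p>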
During a long period of constant loss values, you may temporarily get a false sense of convergence.</p> <p>See also <a href="#early_stopping"><strong>early stopping</strong></a>.</p> <p>See <a href="/machine-learning/crash-course/linear-regression/gradient-descent#model_convergence_and_loss_curves">Model convergence and loss curves</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="convex_function"></a> <h2 class="hide-from-toc" id="convex-function" data-text=" convex function" tabindex="-1"> convex function</h2></p> <p>A function in which the region above the graph of the function is a <a href="#convex_set"><strong>convex set</strong></a>. The prototypical convex function is shaped something like the letter <strong>U</strong>. For example, the following are all convex functions:</p> <p> <img src="/static/machine-learning/glossary/images/convex_functions.png" loading="lazy" alt="U-shaped curves, each with a single minimum point." > </p> <p>In contrast, the following function is not convex. Notice how the region above the graph is not a convex set:</p> <p> <img src="/static/machine-learning/glossary/images/nonconvex_function.svg" loading="lazy" alt="A W-shaped curve with two different local minimum points." > </p> <p>A <strong>strictly convex function</strong> has exactly one local minimum point, which is also the global minimum point. The classic U-shaped functions are strictly convex functions. However, some convex functions (for example, straight lines) are not U-shaped.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-for-a-deeper-look-at-the-math." data-text=" Click the icon for a deeper look at the math. " tabindex="-1"> Click the icon for a deeper look at the math. </h4> <div class="expand-background"> <p> A lot of the common <a href="#loss-function">loss functions</a>, including the following, are convex functions: </p> <ul> <li><a href="#L2_loss"><b>L<sub>2</sub> loss</b></a></li> <li><a href="#Log_Loss"><b>Log Loss</b></a></li> <li><a href="#L1_regularization"><b>L<sub>1</sub> regularization</b></a></li> <li><a href="#L2_regularization"><b>L<sub>2</sub> regularization</b></a></li> </ul> <p> Many variations of <a href="#gradient_descent"><b>gradient descent</b></a> are guaranteed to find a point close to the minimum of a strictly convex function. Similarly, many variations of <a href="#SGD"><b>stochastic gradient descent</b></a> have a high probability (though, not a guarantee) of finding a point close to the minimum of a strictly convex function. </p> <p> The sum of two convex functions (for example, L<sub>2</sub> loss + L<sub>1</sub> regularization) is a convex function. </p> <p> <a href="#deep_model"><b>Deep models</b></a> are never convex functions. Remarkably, algorithms designed for <a href="#convex_optimization"><b>convex optimization</b></a> tend to find reasonably good solutions on deep networks anyway, even though those solutions are not guaranteed to be a global minimum. 
</p> </div> <hr /> </section> <p>See <a href="/machine-learning/crash-course/linear-regression/gradient-descent#convergence_and_convex_functions">Convergence and convex functions</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="convex_optimization"></a> <h2 class="hide-from-toc" id="convex-optimization" data-text="convex optimization" tabindex="-1">convex optimization</h2></p> <p>The process of using mathematical techniques such as <a href="#gradient_descent"><strong>gradient descent</strong></a> to find the minimum of a <a href="#convex_function"><strong>convex function</strong></a>. A great deal of research in machine learning has focused on formulating various problems as convex optimization problems and in solving those problems more efficiently.</p> <p>For complete details, see Boyd and Vandenberghe, <a href="https://web.stanford.edu/~boyd/cvxbook/bv_cvxbook.pdf" target="T">Convex Optimization</a>.</p> <p><a class="glossary-anchor" name="convex_set"></a> <h2 class="hide-from-toc" id="convex-set" data-text="convex set" tabindex="-1">convex set</h2></p> <p>A subset of Euclidean space such that a line drawn between any two points in the subset remains completely within the subset. For instance, the following two shapes are convex sets:</p> <p> <img src="/static/machine-learning/glossary/images/convex_set.png" loading="lazy" alt="One illustration of a rectangle. Another illustration of an oval." > </p> <p>In contrast, the following two shapes are not convex sets:</p> <p> <img src="/static/machine-learning/glossary/images/nonconvex_set.png" loading="lazy" alt="One illustration of a pie-chart with a missing slice. Another illustration of a wildly irregular polygon." > </p> <p><a class="glossary-anchor" name="convolution"></a> <h2 class="hide-from-toc" id="convolution" data-text="convolution" tabindex="-1">convolution</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>In mathematics, casually speaking, a mixture of two functions. In machine learning, a convolution mixes the <a href="#convolutional_filter"><strong>convolutional filter</strong></a> and the input matrix in order to train <a href="#weight"><strong>weights</strong></a>.</p> <p>The term &quot;convolution&quot; in machine learning is often a shorthand way of referring to either <a href="#convolutional_operation"><strong>convolutional operation</strong></a> or <a href="#convolutional_layer"><strong>convolutional layer</strong></a>.</p> <p>Without convolutions, a machine learning algorithm would have to learn a separate weight for every cell in a large <a href="#tensor"><strong>tensor</strong></a>. For example, a machine learning algorithm training on 2K x 2K images would be forced to find 4M separate weights. Thanks to convolutions, a machine learning algorithm only has to find weights for every cell in the <a href="#convolutional_filter"><strong>convolutional filter</strong></a>, dramatically reducing the memory needed to train the model. 
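</p> <p>The following sketch shows the savings in miniature with plain NumPy: a single 3x3 filter (just 9 weights) slides across a small image, so the number of trainable weights doesn&#39;t grow with the image size. The image and filter values here are random placeholders:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Python">import numpy as np

image = np.random.rand(8, 8)        # A tiny single-channel "image."
conv_filter = np.random.rand(3, 3)  # Only 3 * 3 = 9 trainable weights.

# Slide the filter over every 3x3 slice of the image and sum the
# element-wise products; the same 9 weights are reused at every position.
output = np.zeros((6, 6))
for row in range(6):
    for col in range(6):
        window = image[row:row + 3, col:col + 3]
        output[row, col] = np.sum(window * conv_filter)

print(output.shape, conv_filter.size)  # (6, 6) 9
</pre></devsite-code> <p>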
When the convolutional filter is applied, it is simply replicated across cells such that each is multiplied by the filter.</p> <p>See <a href="/machine-learning/practica/image-classification/convolutional-neural-networks">Introducing Convolutional Neural Networks</a> in the Image Classification course for more information.</p> <p><a class="glossary-anchor" name="convolutional_filter"></a> <h2 class="hide-from-toc" id="convolutional-filter" data-text="convolutional filter" tabindex="-1">convolutional filter</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>One of the two actors in a <a href="#convolutional_operation"><strong>convolutional operation</strong></a>. (The other actor is a slice of an input matrix.) A convolutional filter is a matrix having the same <a href="#rank"><strong>rank</strong></a> as the input matrix, but a smaller shape. For example, given a 28x28 input matrix, the filter could be any 2D matrix smaller than 28x28.</p> <p>In photographic manipulation, all the cells in a convolutional filter are typically set to a constant pattern of ones and zeroes. In machine learning, convolutional filters are typically seeded with random numbers and then the network <a href="#training"><strong>trains</strong></a> the ideal values.</p> <p>See <a href="/machine-learning/practica/image-classification/convolutional-neural-networks#1_convolution">Convolution</a> in the Image Classification course for more information.</p> <p><a class="glossary-anchor" name="convolutional_layer"></a> <h2 class="hide-from-toc" id="convolutional-layer" data-text="convolutional layer" tabindex="-1">convolutional layer</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>A layer of a <a href="#deep_model"><strong>deep neural network</strong></a> in which a <a href="#convolutional_filter"><strong>convolutional filter</strong></a> passes along an input matrix. For example, consider the following 3x3 <a href="#convolutional_filter"><strong>convolutional filter</strong></a>:</p> <p> <img src="/static/machine-learning/glossary/images/ConvolutionalFilter33.svg" loading="lazy" alt="A 3x3 matrix with the following values: [[0,1,0], [1,0,1], [0,1,0]]" > </p> <p>The following animation shows a convolutional layer consisting of 9 convolutional operations involving the 5x5 input matrix. Notice that each convolutional operation works on a different 3x3 slice of the input matrix. The resulting 3x3 matrix (on the right) consists of the results of the 9 convolutional operations:</p> <p> <img src="/static/machine-learning/glossary/images/AnimatedConvolution.gif" loading="lazy" alt="An animation showing two matrixes. The first matrix is the 5x5 matrix: [[128,97,53,201,198], [35,22,25,200,195], [37,24,28,197,182], [33,28,92,195,179], [31,40,100,192,177]]. The second matrix is the 3x3 matrix: [[181,303,618], [115,338,605], [169,351,560]]. The second matrix is calculated by applying the convolutional filter [[0, 1, 0], [1, 0, 1], [0, 1, 0]] across different 3x3 subsets of the 5x5 matrix." 
> </p> <p>See <a href="/machine-learning/practica/image-classification/convolutional-neural-networks#fully_connected_layers">Fully Connected Layers</a> in the Image Classification course for more information.</p> <p><a class="glossary-anchor" name="convolutional_neural_network"></a> <h2 class="hide-from-toc" id="convolutional-neural-network" data-text="convolutional neural network" tabindex="-1">convolutional neural network</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>A <a href="#neural_network"><strong>neural network</strong></a> in which at least one layer is a <a href="#convolutional_layer"><strong>convolutional layer</strong></a>. A typical convolutional neural network consists of some combination of the following layers:</p> <ul> <li><a href="#convolutional_layer"><strong>convolutional layers</strong></a></li> <li><a href="#pooling"><strong>pooling layers</strong></a></li> <li><a href="#dense_layer"><strong>dense layers</strong></a></li> </ul> <p>Convolutional neural networks have had great success in certain kinds of problems, such as image recognition.</p> <p><a class="glossary-anchor" name="convolutional_operation"></a> <h2 class="hide-from-toc" id="convolutional-operation" data-text="convolutional operation" tabindex="-1">convolutional operation</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>The following two-step mathematical operation:</p> <ol> <li>Element-wise multiplication of the <a href="#convolutional_filter"><strong>convolutional filter</strong></a> and a slice of an input matrix. (The slice of the input matrix has the same rank and size as the convolutional filter.)</li> <li>Summation of all the values in the resulting product matrix.</li> </ol> <p>For example, consider the following 5x5 input matrix:</p> <p> <img src="/static/machine-learning/glossary/images/ConvolutionalLayerInputMatrix.svg" loading="lazy" alt="The 5x5 matrix: [[128,97,53,201,198], [35,22,25,200,195], [37,24,28,197,182], [33,28,92,195,179], [31,40,100,192,177]]." > </p> <p>Now imagine the following 2x2 convolutional filter:</p> <p> <img src="/static/machine-learning/glossary/images/ConvolutionalLayerFilter.svg" loading="lazy" alt="The 2x2 matrix: [[1, 0], [0, 1]]" > </p> <p>Each convolutional operation involves a single 2x2 slice of the input matrix. For example, suppose we use the 2x2 slice at the top-left of the input matrix. So, the convolution operation on this slice looks as follows:</p> <p> <img src="/static/machine-learning/glossary/images/ConvolutionalLayerOperation.svg" loading="lazy" alt="Applying the convolutional filter [[1, 0], [0, 1]] to the top-left 2x2 section of the input matrix, which is [[128,97], [35,22]]. The convolutional filter leaves the 128 and 22 intact, but zeroes out the 97 and 35. Consequently, the convolution operation yields the value 150 (128+22)." 
> </p> <p>A <a href="#convolutional_layer"><strong>convolutional layer</strong></a> consists of a series of convolutional operations, each acting on a different slice of the input matrix.</p> <p><a class="glossary-anchor" name="cost"></a> <h2 class="hide-from-toc" id="cost" data-text=" cost" tabindex="-1"> cost</h2></p> <p>Synonym for <a href="#loss"><strong>loss</strong></a>.</p> <p><a class="glossary-anchor" name="co-training"></a> <h2 class="hide-from-toc" id="co-training" data-text=" co-training" tabindex="-1"> co-training</h2></p> <p>A <a href="#semi-supervised_learning"><strong>semi-supervised learning</strong></a> approach particularly useful when all of the following conditions are true:</p> <ul> <li>The ratio of <a href="#unlabeled_example"><strong>unlabeled examples</strong></a> to <a href="#labeled_example"><strong>labeled examples</strong></a> in the dataset is high.</li> <li>This is a classification problem (<a href="#binary_classification"><strong>binary</strong></a> or <a href="#multi-class"><strong>multi-class</strong></a>).</li> <li>The <a href="#dataset"><strong>dataset</strong></a> contains two different sets of predictive features that are independent of each other and complementary.</li> </ul> <p>Co-training essentially amplifies independent signals into a stronger signal. For example, consider a <a href="#classification_model"><strong>classification model</strong></a> that categorizes individual used cars as either <em>Good</em> or <em>Bad</em>. One set of predictive features might focus on aggregate characteristics such as the year, make, and model of the car; another set of predictive features might focus on the previous owner&#39;s driving record and the car&#39;s maintenance history.</p> <p>The seminal paper on co-training is <a href="https://www.cs.cmu.edu/%7Eavrim/Papers/cotrain.pdf">Combining Labeled and Unlabeled Data with Co-Training</a> by Blum and Mitchell.</p> <p><a class="glossary-anchor" name="counterfactual_fairness"></a> <h2 class="hide-from-toc" id="counterfactual-fairness" data-text=" counterfactual fairness " tabindex="-1"> counterfactual fairness </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>A <a href="#fairness_metric"><strong>fairness metric</strong></a> that checks whether a classifier produces the same result for one individual as it does for another individual who is identical to the first, except with respect to one or more <a href="#sensitive_attribute"><strong>sensitive attributes</strong></a>. 
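</p> <p>A minimal sketch of such a check, assuming a hypothetical <code translate="no" dir="ltr">predict</code> function that maps a dictionary of feature values to a predicted class:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Python">def counterfactual_check(predict, example, sensitive_attribute, alternative_value):
    """Returns True if predictions match for an example and its counterfactual twin."""
    counterfactual = dict(example)
    counterfactual[sensitive_attribute] = alternative_value
    return predict(example) == predict(counterfactual)

# Hypothetical usage with a loan model:
# counterfactual_check(loan_model.predict_class,
#                      {"income": 52_000, "group": "lilliputian"},
#                      sensitive_attribute="group",
#                      alternative_value="brobdingnagian")
</pre></devsite-code> <p>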
Evaluating a classifier for counterfactual fairness is one method for surfacing potential sources of bias in a model.</p> <p>See either of the following for more information:</p> <ul> <li><a href="/machine-learning/crash-course/fairness/counterfactual-fairness">Fairness: Counterfactual fairness</a> in Machine Learning Crash Course.</li> <li><a href="https://papers.nips.cc/paper/2017/file/1271a7029c9df08643b631b02cf9e116-Paper.pdf">When Worlds Collide: Integrating Different Counterfactual Assumptions in Fairness</a></li> </ul> <p><a class="glossary-anchor" name="coverage_bias"></a> <h2 class="hide-from-toc" id="coverage-bias" data-text=" coverage bias " tabindex="-1"> coverage bias </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>See <a href="#selection_bias"><strong>selection bias</strong></a>.</p> <p><a class="glossary-anchor" name="crash_blossom"></a> <h2 class="hide-from-toc" id="crash-blossom" data-text=" crash blossom" tabindex="-1"> crash blossom</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A sentence or phrase with an ambiguous meaning. Crash blossoms present a significant problem in <a href="#natural_language_understanding"><strong>natural language understanding</strong></a>. For example, the headline <em>Red Tape Holds Up Skyscraper</em> is a crash blossom because an NLU model could interpret the headline literally or figuratively.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-for-additional-notes._4" data-text=" Click the icon for additional notes. " tabindex="-1"> Click the icon for additional notes. </h4> <div class="expand-background"> Just to clarify that mysterious headline: <ul> <li><b>Red Tape</b> could refer to either of the following: <ul> <li>An adhesive</li> <li>Excessive bureaucracy</li> </ul> </li> <li><b>Holds Up</b> could refer to either of the following: <ul> <li>Structural support</li> <li>Delays</li> </ul> </li> </ul> </div> <hr /> </section> <p><a class="glossary-anchor" name="critic"></a> <h2 class="hide-from-toc" id="critic" data-text=" critic" tabindex="-1"> critic</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>Synonym for <a href="#deep_q-network"><strong>Deep Q-Network</strong></a>.</p> <p><a class="glossary-anchor" name="cross-entropy"></a> <h2 class="hide-from-toc" id="cross-entropy" data-text=" cross-entropy" tabindex="-1"> cross-entropy</h2></p> <p>A generalization of <a href="#Log_Loss"><strong>Log Loss</strong></a> to <a href="#multi-class"><strong>multi-class classification problems</strong></a>. Cross-entropy quantifies the difference between two probability distributions. 
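</p> <p>For a single example with a true distribution <em>p</em> (often a one-hot label) and a predicted distribution <em>q</em>, cross-entropy is the negative sum over classes of p(x) multiplied by the log of q(x). A minimal sketch:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Python">import math

def cross_entropy(p, q):
    """Cross-entropy between a true distribution p and a predicted distribution q."""
    return -sum(p_i * math.log(q_i) for p_i, q_i in zip(p, q) if p_i &gt; 0)

# One-hot label (class 2 of 3) versus a model's predicted probabilities.
print(cross_entropy([0.0, 0.0, 1.0], [0.1, 0.2, 0.7]))  # ≈ 0.357, that is, -log(0.7)
</pre></devsite-code> <p>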
See also <a href="#perplexity"><strong>perplexity</strong></a>.</p> <p><a class="glossary-anchor" name="cross-validation"></a> <h2 class="hide-from-toc" id="cross-validation" data-text=" cross-validation" tabindex="-1"> cross-validation</h2></p> <p>A mechanism for estimating how well a <a href="#model"><strong>model</strong></a> would generalize to new data by testing the model against one or more non-overlapping data subsets withheld from the <a href="#training_set"><strong>training set</strong></a>.</p> <p><a class="glossary-anchor" name="cumulative_distribution_function"></a> <h2 class="hide-from-toc" id="cumulative-distribution-function-cdf" data-text=" cumulative distribution function (CDF)" tabindex="-1"> cumulative distribution function (CDF)</h2></p> <p>A function that defines the frequency of samples less than or equal to a target value. For example, consider a normal distribution of continuous values. A CDF tells you that approximately 50% of samples should be less than or equal to the mean and that approximately 84% of samples should be less than or equal to one standard deviation above the mean.</p> <p><a class="glossary-anchor" name="d"></a> <h2 class="glossary" id="d" data-text="D" tabindex="-1">D</h2></p> <p><a class="glossary-anchor" name="data_analysis"></a> <h2 class="hide-from-toc" id="data-analysis" data-text=" data analysis" tabindex="-1"> data analysis</h2></p> <p>Obtaining an understanding of data by considering samples, measurement, and visualization. Data analysis can be particularly useful when a dataset is first received, before one builds the first <a href="#model"><strong>model</strong></a>. It is also crucial in understanding experiments and debugging problems with the system.</p> <p><a class="glossary-anchor" name="data_augmentation"></a> <h2 class="hide-from-toc" id="data-augmentation" data-text=" data augmentation" tabindex="-1"> data augmentation</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>Artificially boosting the range and number of <a href="#training"><strong>training</strong></a> examples by transforming existing <a href="#example"><strong>examples</strong></a> to create additional examples. For example, suppose images are one of your <a href="#feature"><strong>features</strong></a>, but your dataset doesn&#39;t contain enough image examples for the model to learn useful associations. Ideally, you&#39;d add enough <a href="#label"><strong>labeled</strong></a> images to your dataset to enable your model to train properly. If that&#39;s not possible, data augmentation can rotate, stretch, and reflect each image to produce many variants of the original picture, possibly yielding enough labeled data to enable excellent training.</p> <p><a class="glossary-anchor" name="DataFrame"></a> <h2 class="hide-from-toc" id="dataframe" data-text=" DataFrame" tabindex="-1"> DataFrame</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A popular <a href="#pandas"><strong>pandas</strong></a> data type for representing <a href="#dataset"><strong>datasets</strong></a> in memory.</p> <p>A DataFrame is analogous to a table or a spreadsheet. 
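</p> <p>For example, the following sketch builds a small DataFrame from a dictionary of columns; the column names and values are illustrative:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Python">import pandas as pd

df = pd.DataFrame({
    "tree_height_m": [2.1, 15.3, 14.8],              # float64 column
    "species": ["dwarf", "full-size", "full-size"],  # object (string) column
})
print(df.dtypes)
print(df.head())
</pre></devsite-code> <p>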
Each column of a DataFrame has a name (a header), and each row is identified by a unique number.</p> <p>A DataFrame is structured like a 2D array, except that each column can be assigned its own data type.</p> <p>See also the official <a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html"><strong>pandas.DataFrame reference page</strong></a>.</p> <p><a class="glossary-anchor" name="data-parallelism"></a> <h2 class="hide-from-toc" id="data-parallelism" data-text=" data parallelism" tabindex="-1"> data parallelism</h2></p> <p>A way of scaling <a href="#training"><strong>training</strong></a> or <a href="#inference"><strong>inference</strong></a> that replicates an entire <a href="#model"><strong>model</strong></a> onto multiple devices and then passes a subset of the input data to each device. Data parallelism can enable training and inference on very large <a href="#batch_size"><strong>batch sizes</strong></a>; however, data parallelism requires that the model be small enough to fit on all devices.</p> <p>Data parallelism typically speeds training and inference.</p> <p>See also <a href="#model-parallelism"><strong>model parallelism</strong></a>.</p> <p><a class="glossary-anchor" name="data_set"></a> <a class="glossary-anchor" name="dataset"></a> <h2 class="hide-from-toc" id="data-set-or-dataset" data-text=" data set or dataset" tabindex="-1"> data set or dataset</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A collection of raw data, commonly (but not exclusively) organized in one of the following formats:</p> <ul> <li>a spreadsheet</li> <li>a file in CSV (comma-separated values) format</li> </ul> <p><a class="glossary-anchor" name="dataset_API"></a> <h2 class="hide-from-toc" id="dataset-api-tf.data" data-text=" Dataset API (tf.data)" tabindex="-1"> Dataset API (tf.data)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>A high-level <a href="#TensorFlow"><strong>TensorFlow</strong></a> API for reading data and transforming it into a form that a machine learning algorithm requires. A <code translate="no" dir="ltr">tf.data.Dataset</code> object represents a sequence of elements, in which each element contains one or more <a href="#tensor"><strong>Tensors</strong></a>. A <code translate="no" dir="ltr">tf.data.Iterator</code> object provides access to the elements of a <code translate="no" dir="ltr">Dataset</code>.</p> <p><a class="glossary-anchor" name="decision_boundary"></a> <h2 class="hide-from-toc" id="decision-boundary" data-text=" decision boundary" tabindex="-1"> decision boundary</h2></p> <p>The separator between <a href="#class"><strong>classes</strong></a> learned by a <a href="#model"><strong>model</strong></a> in <a href="#binary_classification"><strong>binary classification</strong></a> or <a href="#multi-class"><strong>multi-class classification</strong></a> problems. For example, in the following image representing a binary classification problem, the decision boundary is the frontier between the orange class and the blue class:</p> <p> <img src="/static/machine-learning/glossary/images/decision_boundary.png" loading="lazy" alt="A well-defined boundary between one class and another." 
> </p> <p><a class="glossary-anchor" name="decision-forest"></a> <h2 class="hide-from-toc" id="decision-forest" data-text=" decision forest " tabindex="-1"> decision forest </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>A model created from multiple <a href="#decision-tree"><strong>decision trees</strong></a>. A decision forest makes a prediction by aggregating the predictions of its decision trees. Popular types of decision forests include <a href="#random-forest"><strong>random forests</strong></a> and <a href="#gbt"><strong>gradient boosted trees</strong></a>.</p> <p>See the <a href="/machine-learning/decision-forests/intro-to-decision-forests-real">Decision Forests</a> section in the Decision Forests course for more information.</p> <p><a class="glossary-anchor" name="decision_threshold"></a> <h2 class="hide-from-toc" id="decision-threshold" data-text=" decision threshold" tabindex="-1"> decision threshold</h2></p> <p>Synonym for <a href="#classification_threshold"><strong>classification threshold</strong></a>.</p> <p><a class="glossary-anchor" name="decision-tree"></a> <h2 class="hide-from-toc" id="decision-tree" data-text=" decision tree" tabindex="-1"> decision tree</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>A supervised learning model composed of a set of <a href="#condition"><strong>conditions</strong></a> and <a href="#leaf"><strong>leaves</strong></a> organized hierarchically. For example, the following is a decision tree:</p> <p> <img src="/static/machine-learning/glossary/images/DecisionTree.png" loading="lazy" width="559" alt="A decision tree consisting of four conditions arranged hierarchically, which lead to five leaves." 
> </p> <p><a class="glossary-anchor" name="decoder"></a> <h2 class="hide-from-toc" id="decoder" data-text=" decoder" tabindex="-1"> decoder</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>In general, any ML system that converts from a processed, dense, or internal representation to a more raw, sparse, or external representation.</p> <p>Decoders are often a component of a larger model, where they are frequently paired with an <a href="#encoder"><strong>encoder</strong></a>.</p> <p>In <a href="#sequence-to-sequence-task"><strong>sequence-to-sequence tasks</strong></a>, a decoder starts with the internal state generated by the encoder to predict the next sequence.</p> <p>Refer to <a href="#Transformer"><strong>Transformer</strong></a> for the definition of a decoder within the Transformer architecture.</p> <p>See <a href="/machine-learning/crash-course/llm">Large language models</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="deep_model"></a> <h2 class="hide-from-toc" id="deep-model" data-text=" deep model" tabindex="-1"> deep model</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A <a href="#neural_network"><strong>neural network</strong></a> containing more than one <a href="#hidden_layer"><strong>hidden layer</strong></a>.</p> <p>A deep model is also called a <strong>deep neural network</strong>.</p> <p>Contrast with <a href="#wide_model"><strong>wide model</strong></a>.</p> <p><a class="glossary-anchor" name="deep_neural_network"></a> <h2 class="hide-from-toc" id="deep-neural-network" data-text=" deep neural network" tabindex="-1"> deep neural network</h2></p> <p>Synonym for <a href="#deep_model"><strong>deep model</strong></a>.</p> <p><a class="glossary-anchor" name="deep_q-network"></a> <h2 class="hide-from-toc" id="deep-q-network-dqn" data-text=" Deep Q-Network (DQN)" tabindex="-1"> Deep Q-Network (DQN)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>In <a href="#q-learning"><strong>Q-learning</strong></a>, a deep <a href="#neural_network"><strong>neural network</strong></a> that predicts <a href="#q-function"><strong>Q-functions</strong></a>.</p> <p><strong>Critic</strong> is a synonym for Deep Q-Network.</p> <p><a class="glossary-anchor" name="demographic_parity"></a> <h2 class="hide-from-toc" id="demographic-parity" data-text=" demographic parity" tabindex="-1"> demographic parity</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>A <a href="#fairness_metric"><strong>fairness metric</strong></a> that is satisfied if the results of a model&#39;s classification are not dependent on a given <a href="#sensitive_attribute"><strong>sensitive attribute</strong></a>.</p> <p>For example, if both Lilliputians and Brobdingnagians apply to Glubbdubdrib University, demographic parity is achieved if the percentage of Lilliputians admitted is the same as the percentage of Brobdingnagians admitted, irrespective of whether one group is on average more qualified than the other.</p> <p>Contrast with <a href="#equalized_odds"><strong>equalized odds</strong></a> and <a href="#equality_of_opportunity"><strong>equality of opportunity</strong></a>, which permit classification results in aggregate to depend on sensitive attributes, but don&#39;t permit 
classification results for certain specified <a href="#ground_truth"><strong>ground truth</strong></a> labels to depend on sensitive attributes. See <a href="http://research.google.com/bigpicture/attacking-discrimination-in-ml/" target="T">&quot;Attacking discrimination with smarter machine learning&quot;</a> for a visualization exploring the tradeoffs when optimizing for demographic parity.</p> <p>See <a href="/machine-learning/crash-course/fairness/demographic-parity">Fairness: demographic parity</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="denoising"></a> <h2 class="hide-from-toc" id="denoising" data-text=" denoising" tabindex="-1"> denoising</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A common approach to <a href="#self-supervised-learning"><strong>self-supervised learning</strong></a> in which:</p> <ol> <li><a href="#noise"><strong>Noise</strong></a> is artificially added to the dataset.</li> <li>The <a href="#model"><strong>model</strong></a> tries to remove the noise.</li> </ol> <p>Denoising enables learning from <a href="#unlabeled_example"><strong>unlabeled examples</strong></a>. The original <a href="#dataset"><strong>dataset</strong></a> serves as the target or <a href="#label"><strong>label</strong></a> and the noisy data as the input.</p> <p>Some <a href="#masked-language-model"><strong>masked language models</strong></a> use denoising as follows:</p> <ol> <li>Noise is artificially added to an unlabeled sentence by masking some of the tokens.</li> <li>The model tries to predict the original tokens.</li> </ol> <p><a class="glossary-anchor" name="dense_feature"></a> <h2 class="hide-from-toc" id="dense-feature" data-text=" dense feature" tabindex="-1"> dense feature</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A <a href="#feature"><strong>feature</strong></a> in which most or all values are nonzero, typically a <a href="#tensor"><strong>Tensor</strong></a> of floating-point values. 
For example, the following 10-element Tensor is dense because 9 of its values are nonzero:</p> <table> <tr> <td>8</td> <td>3</td> <td>7</td> <td>5</td> <td>2</td> <td>4</td> <td>0</td> <td>4</td> <td>9</td> <td>6</td> </tr> </table> <p>Contrast with <a href="#sparse_features"><strong>sparse feature</strong></a>.</p> <p><a class="glossary-anchor" name="dense_layer"></a> <h2 class="hide-from-toc" id="dense-layer" data-text=" dense layer" tabindex="-1"> dense layer</h2></p> <p>Synonym for <a href="#fully_connected_layer"><strong>fully connected layer</strong></a>.</p> <p><a class="glossary-anchor" name="depth"></a> <h2 class="hide-from-toc" id="depth" data-text=" depth" tabindex="-1"> depth</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The sum of the following in a <a href="#neural_network"><strong>neural network</strong></a>:</p> <ul> <li>the number of <a href="#hidden_layer"><strong>hidden layers</strong></a></li> <li>the number of <a href="#output_layer"><strong>output layers</strong></a>, which is typically 1</li> <li>the number of any <a href="#embedding_layer"><strong>embedding layers</strong></a></li> </ul> <p>For example, a neural network with five hidden layers and one output layer has a depth of 6.</p> <p>Notice that the <a href="#input-layer"><strong>input layer</strong></a> doesn&#39;t influence depth.</p> <p><a class="glossary-anchor" name="depthwise_separable_cnn"></a> <h2 class="hide-from-toc" id="depthwise-separable-convolutional-neural-network-sepcnn" data-text="depthwise separable convolutional neural network (sepCNN)" tabindex="-1">depthwise separable convolutional neural network (sepCNN)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>A <a href="#convolutional_neural_network"><strong>convolutional neural network</strong></a> architecture based on <a href="https://github.com/tensorflow/tpu/tree/master/models/experimental/inception">Inception</a>, but where Inception modules are replaced with depthwise separable convolutions. 
Also known as Xception.</p> <p>A depthwise separable convolution (also abbreviated as separable convolution) factors a standard 3D convolution into two separate convolution operations that are more computationally efficient: first, a depthwise convolution, with a depth of 1 (n ✕ n ✕ 1), and then second, a pointwise convolution, with length and width of 1 (1 ✕ 1 ✕ n).</p> <p>To learn more, see <a href="https://arxiv.org/pdf/1610.02357.pdf">Xception: Deep Learning with Depthwise Separable Convolutions</a>.</p> <p><a class="glossary-anchor" name="derived-label"></a> <h2 class="hide-from-toc" id="derived-label" data-text=" derived label" tabindex="-1"> derived label</h2></p> <p>Synonym for <a href="#proxy_labels"><strong>proxy label</strong></a>.</p> <p><a class="glossary-anchor" name="device"></a> <h2 class="hide-from-toc" id="device" data-text=" device" tabindex="-1"> device</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> <div class="glossary-icon" title="Google Cloud">#GoogleCloud</div> </div></p> <p>An overloaded term with the following two possible definitions:</p> <ol> <li>A category of hardware that can run a TensorFlow session, including CPUs, GPUs, and <a href="#TPU"><strong>TPUs</strong></a>.</li> <li>When training an ML model on <a href="#accelerator-chip"><strong>accelerator chips</strong></a> (GPUs or TPUs), the part of the system that actually manipulates <a href="#tensor"><strong>tensors</strong></a> and <a href="#embedding_layer"><strong>embeddings</strong></a>. The device runs on accelerator chips. In contrast, the <a href="#host"><strong>host</strong></a> typically runs on a CPU.</li> </ol> <p><a class="glossary-anchor" name="differential-privacy"></a> <h2 class="hide-from-toc" id="differential-privacy" data-text=" differential privacy" tabindex="-1"> differential privacy</h2></p> <p>In machine learning, an anonymization approach to protect any sensitive data (for example, an individual&#39;s personal information) included in a model&#39;s <a href="#training_set"><strong>training set</strong></a> from being exposed. This approach ensures that the <a href="#model"><strong>model</strong></a> doesn&#39;t learn or remember much about a specific individual. This is accomplished by sampling and adding noise during model training to obscure individual data points, mitigating the risk of exposing sensitive training data.</p> <p>Differential privacy is also used outside of machine learning. For example, data scientists sometimes use differential privacy to protect individual privacy when computing product usage statistics for different demographics.</p> <p><a class="glossary-anchor" name="dimension_reduction"></a> <h2 class="hide-from-toc" id="dimension-reduction" data-text=" dimension reduction" tabindex="-1"> dimension reduction</h2></p> <p>Decreasing the number of dimensions used to represent a particular feature in a feature vector, typically by converting to an <a href="#embedding_vector"><strong>embedding vector</strong></a>.</p> <p><a class="glossary-anchor" name="dimensions"></a> <h2 class="hide-from-toc" id="dimensions" data-text=" dimensions " tabindex="-1"> dimensions </h2></p> <p>Overloaded term having any of the following definitions:</p> <ul> <li><p>The number of levels of coordinates in a <a href="#tensor"><strong>Tensor</strong></a>. 
For example:</p> <ul> <li>A scalar has zero dimensions; for example, <code translate="no" dir="ltr">[&quot;Hello&quot;]</code>.</li> <li>A vector has one dimension; for example, <code translate="no" dir="ltr">[3, 5, 7, 11]</code>.</li> <li>A matrix has two dimensions; for example, <code translate="no" dir="ltr">[[2, 4, 18], [5, 7, 14]]</code>. You can uniquely specify a particular cell in a one-dimensional vector with one coordinate; you need two coordinates to uniquely specify a particular cell in a two-dimensional matrix.</li> </ul></li> <li><p>The number of entries in a <a href="#feature_vector"><strong>feature vector</strong></a>.</p></li> <li><p>The number of elements in an <a href="#embedding_layer"><strong>embedding layer</strong></a>.</p></li> </ul> <p><a class="glossary-anchor" name="direct-prompting"></a> <h2 class="hide-from-toc" id="direct-prompting" data-text=" direct prompting " tabindex="-1"> direct prompting </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>Synonym for <a href="#zero-shot-prompting"><strong>zero-shot prompting</strong></a>.</p> <p><a class="glossary-anchor" name="discrete_feature"></a> <h2 class="hide-from-toc" id="discrete-feature" data-text=" discrete feature" tabindex="-1"> discrete feature</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A <a href="#feature"><strong>feature</strong></a> with a finite set of possible values. For example, a feature whose values may only be <em>animal</em>, <em>vegetable</em>, or <em>mineral</em> is a discrete (or categorical) feature.</p> <p>Contrast with <a href="#continuous_feature"><strong>continuous feature</strong></a>.</p> <p><a class="glossary-anchor" name="discriminative_model"></a> <h2 class="hide-from-toc" id="discriminative-model" data-text=" discriminative model" tabindex="-1"> discriminative model</h2></p> <p>A <a href="#model"><strong>model</strong></a> that predicts <a href="#label"><strong>labels</strong></a> from a set of one or more <a href="#feature"><strong>features</strong></a>. 
More formally, discriminative models define the conditional probability of an output given the features and <a href="#weight"><strong>weights</strong></a>; that is:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Text only">p(output | features, weights)</pre></devsite-code> <p>For example, a model that predicts whether an email is spam from features and weights is a discriminative model.</p> <p>The vast majority of supervised learning models, including classification and regression models, are discriminative models.</p> <p>Contrast with <a href="#generative_model"><strong>generative model</strong></a>.</p> <p><a class="glossary-anchor" name="discriminator"></a> <h2 class="hide-from-toc" id="discriminator" data-text=" discriminator" tabindex="-1"> discriminator</h2></p> <p>A system that determines whether <a href="#example"><strong>examples</strong></a> are real or fake.</p> <p>Alternatively, the subsystem within a <a href="#generative_adversarial_network"><strong>generative adversarial network</strong></a> that determines whether the examples created by the <a href="#generator"><strong>generator</strong></a> are real or fake.</p> <p>See <a href="/machine-learning/gan/discriminator">The discriminator</a> in the GAN course for more information.</p> <p><a class="glossary-anchor" name="disparate_impact"></a> <h2 class="hide-from-toc" id="disparate-impact" data-text=" disparate impact" tabindex="-1"> disparate impact</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>Making decisions about people that impact different population subgroups disproportionately. This usually refers to situations where an algorithmic decision-making process harms or benefits some subgroups more than others.</p> <p>For example, suppose an algorithm that determines a Lilliputian&#39;s eligibility for a miniature-home loan is more likely to classify them as &quot;ineligible&quot; if their mailing address contains a certain postal code. If Big-Endian Lilliputians are more likely to have mailing addresses with this postal code than Little-Endian Lilliputians, then this algorithm may result in disparate impact.</p> <p>Contrast with <a href="#disparate_treatment"><strong>disparate treatment</strong></a>, which focuses on disparities that result when subgroup characteristics are explicit inputs to an algorithmic decision-making process.</p> <p><a class="glossary-anchor" name="disparate_treatment"></a> <h2 class="hide-from-toc" id="disparate-treatment" data-text=" disparate treatment" tabindex="-1"> disparate treatment</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>Factoring subjects&#39; <a href="#sensitive_attribute"><strong>sensitive attributes</strong></a> into an algorithmic decision-making process such that different subgroups of people are treated differently.</p> <p>For example, consider an algorithm that determines Lilliputians&#39; eligibility for a miniature-home loan based on the data they provide in their loan application. 
If the algorithm uses a Lilliputian&#39;s affiliation as Big-Endian or Little-Endian as an input, it is enacting disparate treatment along that dimension.</p> <p>Contrast with <a href="#disparate_impact"><strong>disparate impact</strong></a>, which focuses on disparities in the societal impacts of algorithmic decisions on subgroups, irrespective of whether those subgroups are inputs to the model.</p> <aside class="warning"><strong>Warning:</strong><span> Because sensitive attributes are almost always correlated with other features the data may have, explicitly removing sensitive attribute information doesn&#39;t guarantee that subgroups will be treated equally. For example, removing sensitive demographic attributes from a training dataset that still includes postal code as a feature may address disparate treatment of subgroups, but there still might be disparate impact upon these groups because postal code might serve as a <a href="#proxy_sensitive_attributes"><strong>proxy</strong></a> for other demographic information.</span></aside> <p><a class="glossary-anchor" name="distillation"></a> <h2 class="hide-from-toc" id="distillation" data-text=" distillation" tabindex="-1"> distillation</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>The process of reducing the size of one <a href="#model"><strong>model</strong></a> (known as the <strong>teacher</strong>) into a smaller model (known as the <strong>student</strong>) that emulates the original model&#39;s predictions as faithfully as possible. Distillation is useful because the smaller model has two key benefits over the larger model (the teacher):</p> <ul> <li>Faster inference time</li> <li>Reduced memory and energy usage</li> </ul> <p>However, the student&#39;s predictions are typically not as good as the teacher&#39;s predictions.</p> <p>Distillation trains the student model to minimize a <a href="#loss-function"><strong>loss function</strong></a> based on the difference between the outputs of the predictions of the student and teacher models.</p> <p>Compare and contrast distillation with the following terms:</p> <ul> <li><a href="#fine-tuning"><strong>fine-tuning</strong></a></li> <li><a href="#prompt-based-learning"><strong>prompt-based learning</strong></a></li> </ul> <p>See <a href="/machine-learning/crash-course/llm/tuning">LLMs: Fine-tuning, distillation, and prompt engineering</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="distribution"></a> <h2 class="hide-from-toc" id="distribution" data-text=" distribution" tabindex="-1"> distribution</h2></p> <p>The frequency and range of different values for a given <a href="#feature"><strong>feature</strong></a> or <a href="#label"><strong>label</strong></a>. A distribution captures how likely a particular value is.</p> <p>The following image shows histograms of two different distributions:</p> <ul> <li>On the left, a power law distribution of wealth versus the number of people possessing that wealth.</li> <li>On the right, a normal distribution of height versus the number of people possessing that height.</li> </ul> <p> <img src="/static/machine-learning/glossary/images/Distributions.png" loading="lazy" alt="Two histograms. One histogram shows a power law distribution with wealth on the x-axis and number of people having that wealth on the y-axis. Most people have very little wealth, and a few people have a lot of wealth. 
The other histogram shows a normal distribution with height on the x-axis and number of people having that height on the y-axis. Most people are clustered somewhere near the mean." > </p> <p>Understanding each feature and label&#39;s distribution can help you determine how to <a href="#normalization"><strong>normalize</strong></a> values and detect <a href="#outliers"><strong>outliers</strong></a>.</p> <p>The phrase <strong>out of distribution</strong> refers to a value that doesn&#39;t appear in the dataset or is very rare. For example, an image of the planet Saturn would be considered out of distribution for a dataset consisting of cat images.</p> <p><a class="glossary-anchor" name="divisive_clustering"></a> <h2 class="hide-from-toc" id="divisive-clustering" data-text=" divisive clustering" tabindex="-1"> divisive clustering</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Clustering">#clustering</div> </div></p> <p>See <a href="#hierarchical_clustering"><strong>hierarchical clustering</strong></a>.</p> <p><a class="glossary-anchor" name="downsampling"></a> <h2 class="hide-from-toc" id="downsampling" data-text=" downsampling" tabindex="-1"> downsampling</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>Overloaded term that can mean either of the following:</p> <ul> <li>Reducing the amount of information in a <a href="#feature"><strong>feature</strong></a> in order to <a href="#training"><strong>train</strong></a> a model more efficiently. For example, before training an image recognition model, downsampling high-resolution images to a lower-resolution format.</li> <li>Training on a disproportionately low percentage of over-represented <a href="#class"><strong>class</strong></a> examples in order to improve model training on under-represented classes. For example, in a <a href="#class_imbalanced_data_set"><strong>class-imbalanced dataset</strong></a>, models tend to learn a lot about the <a href="#majority_class"><strong>majority class</strong></a> and not enough about the <a href="#minority_class"><strong>minority class</strong></a>. Downsampling helps balance the amount of training on the majority and minority classes.</li> </ul> <p>See <a href="/machine-learning/crash-course/overfitting/imbalanced-datasets">Datasets: Imbalanced datasets</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="DQN"></a> <h2 class="hide-from-toc" id="dqn" data-text=" DQN" tabindex="-1"> DQN</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>Abbreviation for <a href="#deep_q-network"><strong>Deep Q-Network</strong></a>.</p> <p><a class="glossary-anchor" name="dropout_regularization"></a> <h2 class="hide-from-toc" id="dropout-regularization" data-text=" dropout regularization" tabindex="-1"> dropout regularization</h2></p> <p>A form of <a href="#regularization"><strong>regularization</strong></a> useful in training <a href="#neural_network"><strong>neural networks</strong></a>. Dropout regularization removes a random selection of a fixed number of the units in a network layer for a single gradient step. The more units dropped out, the stronger the regularization. This is analogous to training the network to emulate an exponentially large <a href="#ensemble"><strong>ensemble</strong></a> of smaller networks. 
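</p> <p>For example, the following sketch (assuming <code translate="no" dir="ltr">tf.keras</code>; the layer sizes and dropout rate are illustrative only) applies dropout to the outputs of a hidden layer:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded>
import tensorflow as tf

# During training, Dropout(0.5) zeroes each output of the preceding layer
# with probability 0.5 on every gradient step; at inference time the layer
# passes values through unchanged.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dropout(0.5),   # higher rate, stronger regularization
    tf.keras.layers.Dense(1),
])
</pre></devsite-code> <p>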
For full details, see <a href="http://jmlr.org/papers/volume15/srivastava14a/srivastava14a.pdf" target="T">Dropout: A Simple Way to Prevent Neural Networks from Overfitting</a>.</p> <p><a class="glossary-anchor" name="dynamic"></a> <h2 class="hide-from-toc" id="dynamic" data-text=" dynamic" tabindex="-1"> dynamic</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Something done frequently or continuously. The terms <strong>dynamic</strong> and <strong>online</strong> are synonyms in machine learning. The following are common uses of <strong>dynamic</strong> and <strong>online</strong> in machine learning:</p> <ul> <li>A <a href="#dynamic_model"><strong>dynamic model</strong></a> (or <strong>online model</strong>) is a model that is retrained frequently or continuously.</li> <li><strong>Dynamic training</strong> (or <strong>online training</strong>) is the process of training frequently or continuously.</li> <li><strong>Dynamic inference</strong> (or <strong>online inference</strong>) is the process of generating predictions on demand.</li> </ul> <p><a class="glossary-anchor" name="dynamic_model"></a> <h2 class="hide-from-toc" id="dynamic-model" data-text=" dynamic model" tabindex="-1"> dynamic model</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A <a href="#model"><strong>model</strong></a> that is frequently (maybe even continuously) retrained. A dynamic model is a &quot;lifelong learner&quot; that constantly adapts to evolving data. A dynamic model is also known as an <strong>online model</strong>.</p> <p>Contrast with <a href="#static-model"><strong>static model</strong></a>.</p> <p><a class="glossary-anchor" name="e"></a> <h2 class="glossary" id="e" data-text="E" tabindex="-1">E</h2></p> <p><a class="glossary-anchor" name="eager_execution"></a> <h2 class="hide-from-toc" id="eager-execution" data-text=" eager execution" tabindex="-1"> eager execution</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>A TensorFlow programming environment in which <a href="#Operation"><strong>operations</strong></a> run immediately. In contrast, operations called in <a href="#graph_execution"><strong>graph execution</strong></a> don&#39;t run until they are explicitly evaluated. Eager execution is an <a href="https://wikipedia.org/wiki/Imperative_programming" target="T">imperative interface</a>, much like the code in most programming languages. Eager execution programs are generally far easier to debug than graph execution programs.</p> <p><a class="glossary-anchor" name="early_stopping"></a> <h2 class="hide-from-toc" id="early-stopping" data-text=" early stopping" tabindex="-1"> early stopping</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A method for <a href="#regularization"><strong>regularization</strong></a> that involves ending <a href="#training"><strong>training</strong></a> <em>before</em> training loss finishes decreasing. 
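</p> <p>For example, the following sketch (assuming <code translate="no" dir="ltr">tf.keras</code>; the monitored metric and patience value are illustrative only) ends training once validation loss stops improving:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded>
import tensorflow as tf

# Stop training if the validation loss hasn't improved for 3 consecutive
# epochs, and roll back to the weights from the best epoch seen so far.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)

# model.fit(x_train, y_train,
#           validation_data=(x_val, y_val),
#           epochs=100,
#           callbacks=[early_stopping])
</pre></devsite-code> <p>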
In early stopping, you intentionally stop training the model when the loss on a <a href="#validation_set"><strong>validation dataset</strong></a> starts to increase; that is, when <a href="#generalization"><strong>generalization</strong></a> performance worsens.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-for-additional-notes._5" data-text=" Click the icon for additional notes. " tabindex="-1"> Click the icon for additional notes. </h4> <div class="expand-background"> <p> Early stopping may seem counterintuitive. After all, telling a model to halt training while the loss is still decreasing may seem like telling a chef to stop cooking before the dessert has fully baked. However, training a model for too long can lead to <a href="#overfitting">overfitting</a>. That is, if you train a model too long, the model may fit the training data so closely that the model doesn't make good predictions on new examples. </p> </div> <hr /> </section> <p><a class="glossary-anchor" name="earth-movers-distance"></a> <h2 class="hide-from-toc" id="earth-movers-distance-emd" data-text=" earth mover's distance (EMD)" tabindex="-1"> earth mover's distance (EMD)</h2></p> <p>A measure of the relative similarity of two <a href="#distribution"><strong>distributions</strong></a>. The lower the earth mover&#39;s distance, the more similar the distributions.</p> <p><a class="glossary-anchor" name="edit-distance"></a> <h2 class="hide-from-toc" id="edit-distance" data-text=" edit distance" tabindex="-1"> edit distance</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A measurement of how similar two text strings are to each other. In machine learning, edit distance is useful because it is simple to compute, and an effective way to compare two strings that are known to be similar or to find strings that are similar to a given string.</p> <p>There are several definitions of edit distance, each using different string operations. For example, the <a href="https://wikipedia.org/wiki/Levenshtein_distance" target="T"> Levenshtein distance</a> considers the fewest delete, insert, and substitute operations.</p> <p>For example, the Levenshtein distance between the words &quot;heart&quot; and &quot;darts&quot; is 3 because the following 3 edits are the fewest changes to turn one word into the other:</p> <ol> <li>heart → deart (substitute &quot;h&quot; with &quot;d&quot;)</li> <li>deart → dart (delete &quot;e&quot;)</li> <li>dart → darts (insert &quot;s&quot;)</li> </ol> <p><a class="glossary-anchor" name="einsum-notation"></a> <h2 class="hide-from-toc" id="einsum-notation" data-text=" Einsum notation" tabindex="-1"> Einsum notation</h2></p> <p>An efficient notation for describing how two <a href="#tensor"><strong>tensors</strong></a> are to be combined. The tensors are combined by multiplying the elements of one tensor by the elements of the other tensor and then summing the products. 
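</p> <p>For example, the following NumPy sketch writes ordinary matrix multiplication as this multiply-then-sum pattern:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded>
import numpy as np

a = np.arange(6).reshape(2, 3)    # shape (2, 3); axes labeled i, j
b = np.arange(12).reshape(3, 4)   # shape (3, 4); axes labeled j, k

# Multiply along the shared axis j and sum the products; the result
# keeps axes i and k, so its shape is (2, 4).
c = np.einsum("ij,jk->ik", a, b)

assert np.array_equal(c, a @ b)   # same result as matrix multiplication
</pre></devsite-code> <p>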
Einsum notation uses symbols to identify the axes of each tensor, and those same symbols are rearranged to specify the shape of the new resulting tensor.</p> <p><a href="#numpy"><strong>NumPy</strong></a> provides a common Einsum implementation.</p> <p><a class="glossary-anchor" name="embedding_layer"></a> <h2 class="hide-from-toc" id="embedding-layer" data-text=" embedding layer" tabindex="-1"> embedding layer</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A special <a href="#hidden_layer"><strong>hidden layer</strong></a> that trains on a high-dimensional <a href="#categorical_data"><strong>categorical</strong></a> feature to gradually learn a lower dimension embedding vector. An embedding layer enables a neural network to train far more efficiently than training just on the high-dimensional categorical feature.</p> <p>For example, Earth currently supports about 73,000 tree species. Suppose tree species is a <a href="#feature"><strong>feature</strong></a> in your model, so your model&#39;s input layer includes a <a href="#one-hot_encoding"><strong>one-hot vector</strong></a> 73,000 elements long. For example, perhaps <code translate="no" dir="ltr">baobab</code> would be represented something like this:</p> <p> <img src="/static/machine-learning/glossary/images/One-HotRepresentationOfTreeSpecies.png" loading="lazy" alt="An array of 73,000 elements. The first 6,232 elements hold the value 0. The next element holds the value 1. The final 66,767 elements hold the value zero." > </p> <p>A 73,000-element array is very long. If you don&#39;t add an embedding layer to the model, training is going to be very time consuming due to multiplying 72,999 zeros. Perhaps you pick the embedding layer to consist of 12 dimensions. Consequently, the embedding layer will gradually learn a new embedding vector for each tree species.</p> <p>In certain situations, <a href="#hashing"><strong>hashing</strong></a> is a reasonable alternative to an embedding layer.</p> <p>See <a href="/machine-learning/crash-course/embeddings">Embeddings</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="embedding_space"></a> <h2 class="hide-from-toc" id="embedding-space" data-text=" embedding space" tabindex="-1"> embedding space</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>The d-dimensional vector space that features from a higher-dimensional vector space are mapped to. Ideally, the embedding space contains a structure that yields meaningful mathematical results; for example, in an ideal embedding space, addition and subtraction of embeddings can solve word analogy tasks.</p> <p>The <a href="https://wikipedia.org/wiki/Dot_product" target="T">dot product</a> of two embeddings is a measure of their similarity.</p> <p><a class="glossary-anchor" name="embedding_vector"></a> <h2 class="hide-from-toc" id="embedding-vector" data-text=" embedding vector" tabindex="-1"> embedding vector</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>Broadly speaking, an array of floating-point numbers taken from <strong>any</strong> <a href="#hidden_layer"><strong>hidden layer</strong></a> that describe the inputs to that hidden layer. 
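</p> <p>For example, the following sketch (assuming <code translate="no" dir="ltr">tf.keras</code>; the species index is illustrative only) builds the 12-dimensional tree-species embedding layer described above and looks up one embedding vector:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded>
import tensorflow as tf

# Each species is an integer index in [0, 73000). The layer learns a
# 12-element vector per species during training, so the network never
# multiplies a 73,000-element one-hot vector.
embedding = tf.keras.layers.Embedding(input_dim=73000, output_dim=12)

baobab_index = tf.constant([6232])        # illustrative index
baobab_vector = embedding(baobab_index)   # shape: (1, 12)
</pre></devsite-code> <p>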
Often, an embedding vector is the array of floating-point numbers trained in an embedding layer. For example, suppose an embedding layer must learn an embedding vector for each of the 73,000 tree species on Earth. Perhaps the following array is the embedding vector for a baobab tree:</p> <p> <img src="/static/machine-learning/glossary/images/EmbeddingBaobab.png" loading="lazy" alt="An array of 12 elements, each holding a floating-point number between 0.0 and 1.0." > </p> <p>An embedding vector is not a bunch of random numbers. An embedding layer determines these values through training, similar to the way a neural network learns other weights during training. Each element of the array is a rating along some characteristic of a tree species. Which element represents which tree species&#39; characteristic? That&#39;s very hard for humans to determine.</p> <p>The mathematically remarkable part of an embedding vector is that similar items have similar sets of floating-point numbers. For example, similar tree species have a more similar set of floating-point numbers than dissimilar tree species. Redwoods and sequoias are related tree species, so they&#39;ll have a more similar set of floating-point numbers than redwoods and coconut palms. The numbers in the embedding vector will change each time you retrain the model, even if you retrain the model with identical input.</p> <p><a class="glossary-anchor" name="empirical_cumulative_distribution_function"></a> <h2 class="hide-from-toc" id="empirical-cumulative-distribution-function-ecdf-or-edf" data-text=" empirical cumulative distribution function (eCDF or EDF)" tabindex="-1"> empirical cumulative distribution function (eCDF or EDF)</h2></p> <p>A <a href="#cumulative_distribution_function"><strong>cumulative distribution function</strong></a> based on <em>empirical measurements</em> from a real dataset. The value of the function at any point along the x-axis is the fraction of observations in the dataset that are less than or equal to the specified value.</p> <p><a class="glossary-anchor" name="ERM"></a> <h2 class="hide-from-toc" id="empirical-risk-minimization-erm" data-text=" empirical risk minimization (ERM)" tabindex="-1"> empirical risk minimization (ERM)</h2></p> <p>Choosing the function that minimizes loss on the training set. Contrast with <a href="#SRM"><strong>structural risk minimization</strong></a>.</p> <p><a class="glossary-anchor" name="encoder"></a> <h2 class="hide-from-toc" id="encoder" data-text=" encoder" tabindex="-1"> encoder</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>In general, any ML system that converts from a raw, sparse, or external representation into a more processed, denser, or more internal representation.</p> <p>Encoders are often a component of a larger model, where they are frequently paired with a <a href="#decoder"><strong>decoder</strong></a>. Some <a href="#Transformer"><strong>Transformers</strong></a> pair encoders with decoders, though other Transformers use only the encoder or only the decoder.</p> <p>Some systems use the encoder&#39;s output as the input to a classification or regression network.</p> <p>In <a href="#sequence-to-sequence-task"><strong>sequence-to-sequence tasks</strong></a>, an encoder takes an input sequence and returns an internal state (a vector). 
Then, the <a href="#decoder"><strong>decoder</strong></a> uses that internal state to predict the next sequence.</p> <p>Refer to <a href="#Transformer"><strong>Transformer</strong></a> for the definition of an encoder in the Transformer architecture.</p> <p>See <a href="/machine-learning/crash-course/llm/transformers">LLMs: What&#39;s a large language model</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="ensemble"></a> <h2 class="hide-from-toc" id="ensemble" data-text=" ensemble" tabindex="-1"> ensemble</h2></p> <p>A collection of <a href="#model"><strong>models</strong></a> trained independently whose predictions are averaged or aggregated. In many cases, an ensemble produces better predictions than a single model. For example, a <a href="#random-forest"><strong>random forest</strong></a> is an ensemble built from multiple <a href="#decision-tree"><strong>decision trees</strong></a>. Note that not all <a href="#decision-forest"><strong>decision forests</strong></a> are ensembles.</p> <p>See <a href="/machine-learning/decision-forests/intro-to-decision-forests">Random Forest</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="entropy"></a> <h2 class="hide-from-toc" id="entropy" data-text=" entropy " tabindex="-1"> entropy </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>In <a href="https://wikipedia.org/wiki/Information_theory" target="T"> information theory</a>, a description of how unpredictable a probability distribution is. Alternatively, entropy is also defined as how much information each <a href="#example"><strong>example</strong></a> contains. A distribution has the highest possible entropy when all values of a random variable are equally likely.</p> <p>The entropy of a set with two possible values &quot;0&quot; and &quot;1&quot; (for example, the labels in a <a href="#binary_classification"><strong>binary classification</strong></a> problem) has the following formula:</p> <p><tt> &nbsp;&nbsp;H = -p log p - q log q = -p log p - (1-p) * log (1-p) </tt></p> <p>where:</p> <ul> <li><tt>H</tt> is the entropy.</li> <li><tt>p</tt> is the fraction of &quot;1&quot; examples.</li> <li><tt>q</tt> is the fraction of &quot;0&quot; examples. Note that q = (1 - p)</li> <li><tt>log</tt> is generally log<sub>2</sub>. In this case, the entropy unit is a bit.</li> </ul> <p>For example, suppose the following:</p> <ul> <li>100 examples contain the value &quot;1&quot;</li> <li>300 examples contain the value &quot;0&quot;</li> </ul> <p>Therefore, the entropy value is:</p> <ul> <li><tt>p = 0.25</tt></li> <li><tt>q = 0.75</tt></li> <li><tt>H = (-0.25)log<sub>2</sub>(0.25) - (0.75)log<sub>2</sub>(0.75) = 0.81 bits per example</tt></li> </ul> <p>A set that is perfectly balanced (for example, 200 &quot;0&quot;s and 200 &quot;1&quot;s) would have an entropy of 1.0 bit per example. 
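</p> <p>The following plain-Python sketch reproduces the preceding calculation:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded>
import math

def binary_entropy(p):
    # p is the fraction of examples labeled 1; q is the fraction labeled 0.
    if p in (0.0, 1.0):
        return 0.0        # a pure set is perfectly predictable
    q = 1.0 - p
    return -p * math.log2(p) - q * math.log2(q)

print(binary_entropy(0.25))   # ~0.81 bits per example (100 ones, 300 zeros)
print(binary_entropy(0.5))    # 1.0 bit per example (perfectly balanced set)
</pre></devsite-code> <p>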
As a set becomes more <a href="#class_imbalanced_data_set"><strong>imbalanced</strong></a>, its entropy moves towards 0.0.</p> <p>In <a href="#decision-tree"><strong>decision trees</strong></a>, entropy helps formulate <a href="#information-gain"><strong>information gain</strong></a> to help the <a href="#splitter"><strong>splitter</strong></a> select the <a href="#condition"><strong>conditions</strong></a> during the growth of a classification decision tree.</p> <p>Compare entropy with:</p> <ul> <li><a href="#gini-impurity"><strong>gini impurity</strong></a></li> <li><a href="#cross-entropy"><strong>cross-entropy</strong></a> loss function</li> </ul> <p>Entropy is often called <em>Shannon&#39;s entropy</em>.</p> <p>See <a href="/machine-learning/decision-forests/binary-classification">Exact splitter for binary classification with numerical features</a> in the Decision Forests course for more information.</p> <p><a class="glossary-anchor" name="environment"></a> <h2 class="hide-from-toc" id="environment" data-text=" environment" tabindex="-1"> environment</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>In reinforcement learning, the world that contains the <a href="#agent"><strong>agent</strong></a> and allows the agent to observe that world&#39;s <a href="#state"><strong>state</strong></a>. For example, the represented world can be a game like chess, or a physical world like a maze. When the agent applies an <a href="#action"><strong>action</strong></a> to the environment, then the environment transitions between states.</p> <p><a class="glossary-anchor" name="episode"></a> <h2 class="hide-from-toc" id="episode" data-text=" episode" tabindex="-1"> episode</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>In reinforcement learning, each of the repeated attempts by the <a href="#agent"><strong>agent</strong></a> to learn an <a href="#environment"><strong>environment</strong></a>.</p> <p><a class="glossary-anchor" name="epoch"></a> <h2 class="hide-from-toc" id="epoch" data-text=" epoch" tabindex="-1"> epoch</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A full training pass over the entire <a href="#training_set"><strong>training set</strong></a> such that each <a href="#example"><strong>example</strong></a> has been processed once.</p> <p>An epoch represents <code translate="no" dir="ltr">N</code>/<a href="#batch_size"><strong>batch size</strong></a> training <a href="#iteration"><strong>iterations</strong></a>, where <code translate="no" dir="ltr">N</code> is the total number of examples.</p> <p>For instance, suppose the following:</p> <ul> <li>The dataset consists of 1,000 examples.</li> <li>The batch size is 50 examples.</li> </ul> <p>Therefore, a single epoch requires 20 iterations:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded> 1 epoch = (N/batch size) = (1,000 / 50) = 20 iterations </pre></devsite-code> <p>See <a href="/machine-learning/crash-course/linear-regression/hyperparameters">Linear regression: Hyperparameters</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="epsilon_greedy_policy"></a> <h2 class="hide-from-toc" id="epsilon-greedy-policy" data-text=" epsilon greedy policy" tabindex="-1"> epsilon greedy policy</h2> <div class="glossary-icon-container"> <div 
class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>In reinforcement learning, a <a href="#policy"><strong>policy</strong></a> that either follows a <a href="#random_policy"><strong>random policy</strong></a> with epsilon probability or a <a href="#greedy_policy"><strong>greedy policy</strong></a> otherwise. For example, if epsilon is 0.9, then the policy follows a random policy 90% of the time and a greedy policy 10% of the time.</p> <p>Over successive episodes, the algorithm reduces epsilon&#39;s value in order to shift from following a random policy to following a greedy policy. By shifting the policy, the agent first randomly explores the environment and then greedily exploits the results of random exploration.</p> <p><a class="glossary-anchor" name="equality_of_opportunity"></a> <h2 class="hide-from-toc" id="equality-of-opportunity" data-text=" equality of opportunity " tabindex="-1"> equality of opportunity </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>A <a href="#fairness_metric"><strong>fairness metric</strong></a> to assess whether a model is predicting the desirable outcome equally well for all values of a <a href="#sensitive_attribute"><strong>sensitive attribute</strong></a>. In other words, if the desirable outcome for a model is the <a href="#positive_class"><strong>positive class</strong></a>, the goal would be to have the <a href="#TP_rate"><strong>true positive rate</strong></a> be the same for all groups.</p> <p>Equality of opportunity is related to <a href="#equalized_odds"><strong>equalized odds</strong></a>, which requires that <em>both</em> the true positive rates and <a href="#FP_rate"><strong>false positive rates</strong></a> are the same for all groups.</p> <p>Suppose Glubbdubdrib University admits both Lilliputians and Brobdingnagians to a rigorous mathematics program. Lilliputians&#39; secondary schools offer a robust curriculum of math classes, and the vast majority of students are qualified for the university program. Brobdingnagians&#39; secondary schools don&#39;t offer math classes at all, and as a result, far fewer of their students are qualified. 
Equality of opportunity is satisfied for the preferred label of &quot;admitted&quot; with respect to nationality (Lilliputian or Brobdingnagian) if qualified students are equally likely to be admitted irrespective of whether they&#39;re a Lilliputian or a Brobdingnagian.</p> <p>For example, suppose 100 Lilliputians and 100 Brobdingnagians apply to Glubbdubdrib University, and admissions decisions are made as follows:</p> <p><strong>Table 1.</strong> Lilliputian applicants (90% are qualified)</p> <table> <tr> <th>&nbsp;</th> <th>Qualified</th> <th>Unqualified</th> </tr> <tr> <th>Admitted</th> <td>45</td> <td>3</td> </tr> <tr> <th>Rejected</th> <td>45</td> <td>7</td> </tr> <tr> <th>Total</th> <td>90</td> <td>10</td> </tr> <tr> <td colspan="3"> Percentage of qualified students admitted: 45/90 = 50%<br/> Percentage of unqualified students rejected: 7/10 = 70%<br/> Total percentage of Lilliputian students admitted: (45+3)/100 = 48% </td> </tr> </table> <p>&nbsp;</p> <p><strong>Table 2.</strong> Brobdingnagian applicants (10% are qualified):</p> <table> <tr> <th>&nbsp;</th> <th>Qualified</th> <th>Unqualified</th> </tr> <tr> <th>Admitted</th> <td>5</td> <td>9</td> </tr> <tr> <th>Rejected</th> <td>5</td> <td>81</td> </tr> <tr> <th>Total</th> <td>10</td> <td>90</td> </tr> <tr> <td colspan="3"> Percentage of qualified students admitted: 5/10 = 50%<br/> Percentage of unqualified students rejected: 81/90 = 90%<br/> Total percentage of Brobdingnagian students admitted: (5+9)/100 = 14% </td> </tr> </table> <p>The preceding examples satisfy equality of opportunity for acceptance of qualified students because qualified Lilliputians and Brobdingnagians both have a 50% chance of being admitted.</p> <p>While equality of opportunity is satisfied, the following two fairness metrics are not satisfied:</p> <ul> <li><a href="#demographic_parity"><strong>demographic parity</strong></a>: Lilliputians and Brobdingnagians are admitted to the university at different rates; 48% of Lilliputians students are admitted, but only 14% of Brobdingnagian students are admitted.</li> <li><a href="#equalized_odds"><strong>equalized odds</strong></a>: While qualified Lilliputian and Brobdingnagian students both have the same chance of being admitted, the additional constraint that unqualified Lilliputians and Brobdingnagians both have the same chance of being rejected is not satisfied. Unqualified Lilliputians have a 70% rejection rate, whereas unqualified Brobdingnagians have a 90% rejection rate.</li> </ul> <p>See <a href="/machine-learning/crash-course/fairness/equality-of-opportunity">Fairness: Equality of opportunity</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="equalized_odds"></a> <h2 class="hide-from-toc" id="equalized-odds" data-text=" equalized odds" tabindex="-1"> equalized odds</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>A fairness metric to assess whether a model is predicting outcomes equally well for all values of a <a href="#sensitive_attribute"><strong>sensitive attribute</strong></a> with respect to both the <a href="#positive_class"><strong>positive class</strong></a> and <a href="#negative_class"><strong>negative class</strong></a>—not just one class or the other exclusively. 
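</p> <p>For example, the following plain-Python sketch checks both rates for each group, using the admission counts from Tables 1 and 2 above (where equality of opportunity was satisfied but equalized odds was not):</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded>
# Treat "qualified" as the actual positive class and "admitted" as the
# predicted positive class.
def rates(tp, fp, fn, tn):
    tpr = tp / (tp + fn)   # true positive rate
    fpr = fp / (fp + tn)   # false positive rate
    return tpr, fpr

lilliputian = rates(tp=45, fp=3, fn=45, tn=7)       # (0.50, 0.30)
brobdingnagian = rates(tp=5, fp=9, fn=5, tn=81)     # (0.50, 0.10)

# The true positive rates match, so equality of opportunity holds, but the
# false positive rates differ, so equalized odds is not satisfied there.
</pre></devsite-code> <p>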
In other words, both the <a href="#TP_rate"><strong>true positive rate</strong></a> and <a href="#FP_rate"><strong>false positive rate</strong></a> should be the same for all groups.</p> <p>Equalized odds is related to <a href="#equality_of_opportunity"><strong>equality of opportunity</strong></a>, which only focuses on error rates for a single class (positive or negative).</p> <p>For example, suppose Glubbdubdrib University admits both Lilliputians and Brobdingnagians to a rigorous mathematics program. Lilliputians&#39; secondary schools offer a robust curriculum of math classes, and the vast majority of students are qualified for the university program. Brobdingnagians&#39; secondary schools don&#39;t offer math classes at all, and as a result, far fewer of their students are qualified. Equalized odds is satisfied provided that no matter whether an applicant is a Lilliputian or a Brobdingnagian, if they are qualified, they are equally likely to get admitted to the program, and if they are not qualified, they are equally likely to get rejected.</p> <p>Suppose 100 Lilliputians and 100 Brobdingnagians apply to Glubbdubdrib University, and admissions decisions are made as follows:</p> <p><strong>Table 3.</strong> Lilliputian applicants (90% are qualified)</p> <table> <tr> <th>&nbsp;</th> <th>Qualified</th> <th>Unqualified</th> </tr> <tr> <th>Admitted</th> <td>45</td> <td>2</td> </tr> <tr> <th>Rejected</th> <td>45</td> <td>8</td> </tr> <tr> <th>Total</th> <td>90</td> <td>10</td> </tr> <tr> <td colspan="3"> Percentage of qualified students admitted: 45/90 = 50%<br/> Percentage of unqualified students rejected: 8/10 = 80%<br/> Total percentage of Lilliputian students admitted: (45+2)/100 = 47% </td> </tr> </table> <p>&nbsp;</p> <p><strong>Table 4.</strong> Brobdingnagian applicants (10% are qualified):</p> <table> <tr> <th>&nbsp;</th> <th>Qualified</th> <th>Unqualified</th> </tr> <tr> <th>Admitted</th> <td>5</td> <td>18</td> </tr> <tr> <th>Rejected</th> <td>5</td> <td>72</td> </tr> <tr> <th>Total</th> <td>10</td> <td>90</td> </tr> <tr> <td colspan="3"> Percentage of qualified students admitted: 5/10 = 50%<br/> Percentage of unqualified students rejected: 72/90 = 80%<br/> Total percentage of Brobdingnagian students admitted: (5+18)/100 = 23% </td> </tr> </table> <p>Equalized odds is satisfied because qualified Lilliputian and Brobdingnagian students both have a 50% chance of being admitted, and unqualified Lilliputian and Brobdingnagian students have an 80% chance of being rejected.</p> <aside class="note"><strong>Note:</strong><span> While equalized odds is satisfied here, <a href="#demographic_parity"><strong>demographic parity</strong></a> is <em>not satisfied</em>. 
Lilliputian and Brobdingnagian students are admitted to Glubbdubdrib University at different rates; 47% of Lilliputian students are admitted, and 23% of Brobdingnagian students are admitted.</span></aside> <p>Equalized odds is formally defined in <a href="https://arxiv.org/pdf/1610.02413.pdf" target="T">&quot;Equality of Opportunity in Supervised Learning&quot;</a> as follows: &quot;predictor Ŷ satisfies equalized odds with respect to protected attribute A and outcome Y if Ŷ and A are independent, conditional on Y.&quot;</p> <aside class="note"><strong>Note:</strong><span> Contrast equalized odds with the more relaxed <a href="#equality_of_opportunity"><strong>equality of opportunity</strong></a> metric.</span></aside> <p><a class="glossary-anchor" name="Estimators"></a> <h2 class="hide-from-toc" id="estimator" data-text=" Estimator" tabindex="-1"> Estimator</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>A deprecated TensorFlow API. Use <a href="#tf.keras"><strong>tf.keras</strong></a> instead of Estimators.</p> <p><a class="glossary-anchor" name="evals"></a> <h2 class="hide-from-toc" id="evals" data-text=" evals" tabindex="-1"> evals</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>Primarily used as an abbreviation for <a href="#LLM-evaluations"><strong>LLM evaluations</strong></a>. More broadly, <strong>evals</strong> is an abbreviation for any form of <a href="#evaluation"><strong>evaluation</strong></a>.</p> <p><a class="glossary-anchor" name="evaluation"></a> <h2 class="hide-from-toc" id="evaluation" data-text=" evaluation" tabindex="-1"> evaluation</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>The process of measuring a model&#39;s quality or comparing different models against each other.</p> <p>To evaluate a <a href="#supervised_machine_learning"><strong>supervised machine learning</strong></a> model, you typically judge it against a <a href="#validation_set"><strong>validation set</strong></a> and a <a href="#test_set"><strong>test set</strong></a>. <a href="#LLM-evaluations"><strong>Evaluating a LLM</strong></a> typically involves broader quality and safety assessments.</p> <p><a class="glossary-anchor" name="example"></a> <h2 class="hide-from-toc" id="example" data-text=" example" tabindex="-1"> example</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The values of one row of <a href="#feature"><strong>features</strong></a> and possibly a <a href="#label"><strong>label</strong></a>. Examples in <a href="#supervised_machine_learning"><strong>supervised learning</strong></a> fall into two general categories:</p> <ul> <li>A <a href="#labeled_example"><strong>labeled example</strong></a> consists of one or more features and a label. Labeled examples are used during training.</li> <li>An <a href="#unlabeled_example"><strong>unlabeled example</strong></a> consists of one or more features but no label. Unlabeled examples are used during inference.</li> </ul> <p>For instance, suppose you are training a model to determine the influence of weather conditions on student test scores. 
Here are three labeled examples:</p> <table> <tr><th colspan="3">Features</th> <th>Label</th></tr> <tr><th width="25%">Temperature</th> <th width="25%">Humidity</th> <th width="25%">Pressure</th> <th width="25%">Test score</th></tr> <tr><td>15</td> <td>47</td> <td>998</td> <td>Good</td></tr> <tr><td>19</td> <td>34</td> <td>1020</td> <td>Excellent</td></tr> <tr><td>18</td> <td>92</td> <td>1012</td> <td>Poor</td></tr> </table> <p>Here are three unlabeled examples:</p> <table> <tr><th width="25%">Temperature</th> <th width="25%">Humidity</th> <th width="25%">Pressure</th> <th width="25%">&nbsp;</th></tr> <tr><td>12</td> <td>62</td> <td>1014</td> <td>&nbsp;</td></tr> <tr><td>21</td> <td>47</td> <td>1017</td> <td>&nbsp;</td></tr> <tr><td>19</td> <td>41</td> <td>1021</td> <td>&nbsp;</td></tr> </table> <p>The row of a <a href="#dataset"><strong>dataset</strong></a> is typically the raw source for an example. That is, an example typically consists of a subset of the columns in the dataset. Furthermore, the features in an example can also include <a href="#synthetic_feature"><strong>synthetic features</strong></a>, such as <a href="#feature_cross"><strong>feature crosses</strong></a>.</p> <p>See <a href="/machine-learning/intro-to-ml/supervised">Supervised Learning</a> in the Introduction to Machine Learning course for more information.</p> <p><a class="glossary-anchor" name="experience_replay"></a> <h2 class="hide-from-toc" id="experience-replay" data-text=" experience replay" tabindex="-1"> experience replay</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>In reinforcement learning, a <a href="#deep_q-network"><strong>DQN</strong></a> technique used to reduce temporal correlations in training data. The <a href="#agent"><strong>agent</strong></a> stores state transitions in a <a href="#replay_buffer"><strong>replay buffer</strong></a>, and then samples transitions from the replay buffer to create training data.</p> <p><a class="glossary-anchor" name="experimenters_bias"></a> <h2 class="hide-from-toc" id="experimenters-bias" data-text=" experimenter's bias " tabindex="-1"> experimenter's bias </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>See <a href="#confirmation_bias"><strong>confirmation bias</strong></a>.</p> <p><a class="glossary-anchor" name="exploding_gradient_problem"></a> <h2 class="hide-from-toc" id="exploding-gradient-problem" data-text=" exploding gradient problem" tabindex="-1"> exploding gradient problem</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Sequence Models">#seq</div> </div></p> <p>The tendency for <a href="#gradient"><strong>gradients</strong></a> in <a href="#deep_neural_network"><strong>deep neural networks</strong></a> (especially <a href="#recurrent_neural_network"><strong>recurrent neural networks</strong></a>) to become surprisingly steep (high). Steep gradients often cause very large updates to the <a href="#weight"><strong>weights</strong></a> of each <a href="#node"><strong>node</strong></a> in a deep neural network.</p> <p>Models suffering from the exploding gradient problem become difficult or impossible to train. 
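</p> <p>For example, the following sketch (assuming <code translate="no" dir="ltr">tf.keras</code>; the threshold is illustrative only) caps the norm of each gradient before the weights are updated:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded>
import tensorflow as tf

# clipnorm rescales any gradient whose L2 norm exceeds 1.0, so a single
# step can't make an enormous update to the weights.
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, clipnorm=1.0)

# model.compile(optimizer=optimizer, loss="mse")
</pre></devsite-code> <p>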
<a href="#gradient_clipping"><strong>Gradient clipping</strong></a> can mitigate this problem.</p> <p>Compare to <a href="#vanishing_gradient_problem"><strong>vanishing gradient problem</strong></a>.</p> <p><a class="glossary-anchor" name="f"></a> <h2 class="glossary" id="f" data-text="F" tabindex="-1">F</h2></p> <p><a class="glossary-anchor" name="F1"></a> <h2 class="hide-from-toc" id="f1" data-text=" F1" tabindex="-1"> F<sub>1</sub></h2></p> <p>A &quot;roll-up&quot; <a href="#binary-classification"><strong>binary classification</strong></a> metric that relies on both <a href="#precision"><strong>precision</strong></a> and <a href="#recall"><strong>recall</strong></a>. Here is the formula:</p> <div> $$F{_1} = \frac{\text{2 * precision * recall}} {\text{precision + recall}}$$ </div> <p>For example, given the following:</p> <ul> <li>precision = 0.6</li> <li>recall = 0.4</li> </ul> <div> $$F{_1} = \frac{\text{2 * 0.6 * 0.4}} {\text{0.6 + 0.4}} = 0.48$$ </div> <p>When precision and recall are fairly similar (as in the preceding example), F<sub>1</sub> is close to their mean. When precision and recall differ significantly, F<sub>1</sub> is closer to the lower value. For example:</p> <ul> <li>precision = 0.9</li> <li>recall = 0.1</li> </ul> <div> $$F{_1} = \frac{\text{2 * 0.9 * 0.1}} {\text{0.9 + 0.1}} = 0.18$$ </div> <p><a class="glossary-anchor" name="fairness_constraint"></a> <h2 class="hide-from-toc" id="fairness-constraint" data-text=" fairness constraint" tabindex="-1"> fairness constraint</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div> Applying a constraint to an algorithm to ensure one or more definitions of fairness are satisfied. Examples of fairness constraints include:</p> <ul> <li><a href="#post-processing"><strong>Post-processing</strong></a> your model&#39;s output.</li> <li>Altering the <a href="#loss"><strong>loss function</strong></a> to incorporate a penalty for violating a <a href="#fairness_metric"><strong>fairness metric</strong></a>.</li> <li>Directly adding a mathematical constraint to an optimization problem.</li> </ul> <p><a class="glossary-anchor" name="fairness_metric"></a> <h2 class="hide-from-toc" id="fairness-metric" data-text=" fairness metric" tabindex="-1"> fairness metric</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>A mathematical definition of &quot;fairness&quot; that is measurable. Some commonly used fairness metrics include:</p> <ul> <li><a href="#equalized_odds"><strong>equalized odds</strong></a></li> <li><a href="#predictive_parity"><strong>predictive parity</strong></a></li> <li><a href="#counterfactual_fairness"><strong>counterfactual fairness</strong></a></li> <li><a href="#demographic_parity"><strong>demographic parity</strong></a></li> </ul> <p>Many fairness metrics are mutually exclusive; see <a href="#incompatibility_of_fairness_metrics"><strong>incompatibility of fairness metrics</strong></a>.</p> <p><a class="glossary-anchor" name="FN"></a> <h2 class="hide-from-toc" id="false-negative-fn" data-text=" false negative (FN)" tabindex="-1"> false negative (FN)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>An example in which the model mistakenly predicts the <a href="#negative_class"><strong>negative class</strong></a>. 
For example, the model predicts that a particular email message is <em>not spam</em> (the negative class), but that email message <em>actually is spam</em>.</p> <p><a class="glossary-anchor" name="false-negative-rate"></a> <h2 class="hide-from-toc" id="false-negative-rate" data-text=" false negative rate" tabindex="-1"> false negative rate</h2></p> <p>The proportion of actual positive examples for which the model mistakenly predicted the negative class. The following formula calculates the false negative rate:</p> <div> $$\text{false negative rate} = \frac{\text{false negatives}}{\text{false negatives} + \text{true positives}}$$ </div> <p>See <a href="/machine-learning/crash-course/classification/thresholding">Thresholds and the confusion matrix</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="FP"></a> <a class="glossary-anchor" name="false_positive"></a> <h2 class="hide-from-toc" id="false-positive-fp" data-text=" false positive (FP)" tabindex="-1"> false positive (FP)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>An example in which the model mistakenly predicts the <a href="#positive_class"><strong>positive class</strong></a>. For example, the model predicts that a particular email message is <em>spam</em> (the positive class), but that email message is <em>actually not spam</em>.</p> <p>See <a href="/machine-learning/crash-course/classification/thresholding">Thresholds and the confusion matrix</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="FP_rate"></a> <h2 class="hide-from-toc" id="false-positive-rate-fpr" data-text=" false positive rate (FPR)" tabindex="-1"> false positive rate (FPR)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The proportion of actual negative examples for which the model mistakenly predicted the positive class. The following formula calculates the false positive rate:</p> <div> $$\text{false positive rate} = \frac{\text{false positives}}{\text{false positives} + \text{true negatives}}$$ </div> <p>The false positive rate is the x-axis in an <a href="#ROC"><strong>ROC curve</strong></a>.</p> <p>See <a href="/machine-learning/crash-course/classification/roc-and-auc">Classification: ROC and AUC</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="feature"></a> <h2 class="hide-from-toc" id="feature" data-text=" feature" tabindex="-1"> feature</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>An input variable to a machine learning model. An <a href="#example"><strong>example</strong></a> consists of one or more features. For instance, suppose you are training a model to determine the influence of weather conditions on student test scores. 
The following table shows three examples, each of which contains three features and one label:</p> <table> <tr><th colspan="3">Features</th> <th>Label</th></tr> <tr><th>Temperature</th> <th>Humidity</th> <th>Pressure</th> <th>Test score </th></tr> <tr><td>15</td> <td>47</td> <td>998</td> <td>92</td></tr> <tr><td>19</td> <td>34</td> <td>1020</td> <td>84</td></tr> <tr><td>18</td> <td>92</td> <td>1012</td> <td>87</td></tr> </table> <p>Contrast with <a href="#label"><strong>label</strong></a>.</p> <p>See <a href="/machine-learning/intro-to-ml/supervised">Supervised Learning</a> in the Introduction to Machine Learning course for more information.</p> <p><a class="glossary-anchor" name="feature_cross"></a> <h2 class="hide-from-toc" id="feature-cross" data-text=" feature cross" tabindex="-1"> feature cross</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A <a href="#synthetic_feature"><strong>synthetic feature</strong></a> formed by &quot;crossing&quot; <a href="#categorical_data"><strong>categorical</strong></a> or <a href="#bucketing"><strong>bucketed</strong></a> features.</p> <p>For example, consider a &quot;mood forecasting&quot; model that represents temperature in one of the following four buckets:</p> <ul> <li><code translate="no" dir="ltr">freezing</code></li> <li><code translate="no" dir="ltr">chilly</code></li> <li><code translate="no" dir="ltr">temperate</code></li> <li><code translate="no" dir="ltr">warm</code></li> </ul> <p>And represents wind speed in one of the following three buckets:</p> <ul> <li><code translate="no" dir="ltr">still</code></li> <li><code translate="no" dir="ltr">light</code></li> <li><code translate="no" dir="ltr">windy</code></li> </ul> <p>Without feature crosses, the linear model trains independently on each of the preceding seven various buckets. So, the model trains on, for example, <code translate="no" dir="ltr">freezing</code> independently of the training on, for example, <code translate="no" dir="ltr">windy</code>.</p> <p>Alternatively, you could create a feature cross of temperature and wind speed. This synthetic feature would have the following 12 possible values:</p> <ul> <li><code translate="no" dir="ltr">freezing-still</code></li> <li><code translate="no" dir="ltr">freezing-light</code></li> <li><code translate="no" dir="ltr">freezing-windy</code></li> <li><code translate="no" dir="ltr">chilly-still</code></li> <li><code translate="no" dir="ltr">chilly-light</code></li> <li><code translate="no" dir="ltr">chilly-windy</code></li> <li><code translate="no" dir="ltr">temperate-still</code></li> <li><code translate="no" dir="ltr">temperate-light</code></li> <li><code translate="no" dir="ltr">temperate-windy</code></li> <li><code translate="no" dir="ltr">warm-still</code></li> <li><code translate="no" dir="ltr">warm-light</code></li> <li><code translate="no" dir="ltr">warm-windy</code></li> </ul> <p>Thanks to feature crosses, the model can learn mood differences between a <code translate="no" dir="ltr">freezing-windy</code> day and a <code translate="no" dir="ltr">freezing-still</code> day.</p> <p>If you create a synthetic feature from two features that each have a lot of different buckets, the resulting feature cross will have a huge number of possible combinations. 
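</p> <p>For example, the following plain-Python sketch forms the 12 crossed values from the temperature and wind-speed buckets above:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded>
from itertools import product

temperature_buckets = ["freezing", "chilly", "temperate", "warm"]
wind_buckets = ["still", "light", "windy"]

# The cross contains every pairwise combination of the two bucket sets.
crossed_values = [f"{t}-{w}"
                  for t, w in product(temperature_buckets, wind_buckets)]

print(len(crossed_values))    # 12 values, such as "freezing-windy"
</pre></devsite-code> <p>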
For example, if one feature has 1,000 buckets and the other feature has 2,000 buckets, the resulting feature cross has 2,000,000 buckets.</p> <p>Formally, a cross is a <a href="https://wikipedia.org/wiki/Cartesian_product" target="T">Cartesian product</a>.</p> <p>Feature crosses are mostly used with linear models and are rarely used with neural networks.</p> <p>See <a href="/machine-learning/crash-course/categorical-data/feature-crosses">Categorical data: Feature crosses</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="feature_engineering"></a> <h2 class="hide-from-toc" id="feature-engineering" data-text=" feature engineering" tabindex="-1"> feature engineering</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>A process that involves the following steps:</p> <ol> <li>Determining which <a href="#feature"><strong>features</strong></a> might be useful in training a model.</li> <li>Converting raw data from the dataset into efficient versions of those features.</li> </ol> <p>For example, you might determine that <code translate="no" dir="ltr">temperature</code> might be a useful feature. Then, you might experiment with <a href="#bucketing"><strong>bucketing</strong></a> to optimize what the model can learn from different <code translate="no" dir="ltr">temperature</code> ranges.</p> <p>Feature engineering is sometimes called <a href="#feature_extraction"><strong>feature extraction</strong></a> or <a href="#featurization"><strong>featurization</strong></a>.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-for-additional-notes-about-tensorflow." data-text=" Click the icon for additional notes about TensorFlow. " tabindex="-1"> Click the icon for additional notes about TensorFlow. </h4> <div class="expand-background"> <p> In TensorFlow, feature engineering often means converting raw log file entries to <a href="#tf.Example"><b>tf.Example</b></a> protocol buffers. See also <a href="https://github.com/tensorflow/transform" target="T">tf.Transform</a>. 
</p> </div> <hr /> </section> <p>See <a href="/machine-learning/crash-course/numerical-data/feature-vectors">Numerical data: How a model ingests data using feature vectors</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="feature_extraction"></a> <h2 class="hide-from-toc" id="feature-extraction" data-text=" feature extraction" tabindex="-1"> feature extraction</h2></p> <p>Overloaded term having either of the following definitions:</p> <ul> <li>Retrieving intermediate feature representations calculated by an <a href="#unsupervised_machine_learning"><strong>unsupervised</strong></a> or pretrained model (for example, <a href="#hidden_layer"><strong>hidden layer</strong></a> values in a <a href="#neural_network"><strong>neural network</strong></a>) for use in another model as input.</li> <li>Synonym for <a href="#feature_engineering"><strong>feature engineering</strong></a>.</li> </ul> <p><a class="glossary-anchor" name="feature-importances"></a> <h2 class="hide-from-toc" id="feature-importances" data-text=" feature importances " tabindex="-1"> feature importances </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>Synonym for <a href="#variable-importances"><strong>variable importances</strong></a>.</p> <p><a class="glossary-anchor" name="feature_set"></a> <h2 class="hide-from-toc" id="feature-set" data-text=" feature set" tabindex="-1"> feature set</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The group of <a href="#feature"><strong>features</strong></a> your machine learning <a href="#model"><strong>model</strong></a> trains on. For example, postal code, property size, and property condition might comprise a simple feature set for a model that predicts housing prices.</p> <p><a class="glossary-anchor" name="feature_spec"></a> <h2 class="hide-from-toc" id="feature-spec" data-text=" feature spec" tabindex="-1"> feature spec</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>Describes the information required to extract <a href="#feature"><strong>features</strong></a> data from the <a href="#tf.Example"><strong>tf.Example</strong></a> protocol buffer. Because the tf.Example protocol buffer is just a container for data, you must specify the following:</p> <ul> <li>The data to extract (that is, the keys for the features)</li> <li>The data type (for example, float or int)</li> <li>The length (fixed or variable)</li> </ul> <p><a class="glossary-anchor" name="feature_vector"></a> <h2 class="hide-from-toc" id="feature-vector" data-text=" feature vector" tabindex="-1"> feature vector</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The array of <a href="#feature"><strong>feature</strong></a> values comprising an <a href="#example"><strong>example</strong></a>. The feature vector is input during <a href="#training"><strong>training</strong></a> and during <a href="#inference"><strong>inference</strong></a>. 
For example, the feature vector for a model with two discrete features might be:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Text only">[0.92, 0.56]</pre></devsite-code> <p> <img src="/static/machine-learning/glossary/images/FeatureVector.png" loading="lazy" alt="Four layers: an input layer, two hidden layers, and one output layer. The input layer contains two nodes, one containing the value 0.92 and the other containing the value 0.56." > </p> <p>Each example supplies different values for the feature vector, so the feature vector for the next example could be something like:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Text only">[0.73, 0.49]</pre></devsite-code> <p><a href="#feature_engineering"><strong>Feature engineering</strong></a> determines how to represent features in the feature vector. For example, a categorical feature with five possible values might be represented with <a href="#one-hot_encoding"><strong>one-hot encoding</strong></a>. In this case, the portion of the feature vector for a particular example would consist of four zeroes and a single 1.0 in the third position, as follows:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded> [0.0, 0.0, 1.0, 0.0, 0.0] </pre></devsite-code> <p>As another example, suppose your model consists of three features:</p> <ul> <li>a categorical feature with <em>five</em> possible values represented with one-hot encoding; for example: <code translate="no" dir="ltr">[0.0, 1.0, 0.0, 0.0, 0.0]</code></li> <li>another categorical feature with <em>three</em> possible values represented with one-hot encoding; for example: <code translate="no" dir="ltr">[0.0, 0.0, 1.0]</code></li> <li>a floating-point feature; for example: <code translate="no" dir="ltr">8.3</code>.</li> </ul> <p>In this case, the feature vector for each example would be represented by <em>nine</em> values. Given the example values in the preceding list, the feature vector would be:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded> 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 8.3 </pre></devsite-code> <p>See <a href="/machine-learning/crash-course/numerical-data/feature-vectors">Numerical data: How a model ingests data using feature vectors</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="featurization"></a> <h2 class="hide-from-toc" id="featurization" data-text=" featurization" tabindex="-1"> featurization</h2></p> <p>The process of extracting <a href="#feature"><strong>features</strong></a> from an input source, such as a document or video, and mapping those features into a <a href="#feature_vector"><strong>feature vector</strong></a>.</p> <p>Some ML experts use featurization as a synonym for <a href="#feature_engineering"><strong>feature engineering</strong></a> or <a href="#feature_extraction"><strong>feature extraction</strong></a>.</p> <p><a class="glossary-anchor" name="federated_learning"></a> <h2 class="hide-from-toc" id="federated-learning" data-text=" federated learning" tabindex="-1"> federated learning</h2></p> <p>A distributed machine learning approach that <a href="#training"><strong>trains</strong></a> machine learning <a href="#model"><strong>models</strong></a> using decentralized <a href="#example"><strong>examples</strong></a> residing on devices such as smartphones. 
In federated learning, a subset of devices downloads the current model from a central coordinating server. The devices use the examples stored on the devices to make improvements to the model. The devices then upload the model improvements (but not the training examples) to the coordinating server, where they are aggregated with other updates to yield an improved global model. After the aggregation, the model updates computed by devices are no longer needed, and can be discarded.</p> <p>Since the training examples are never uploaded, federated learning follows the privacy principles of focused data collection and data minimization.</p> <p>For more information about federated learning, see <a href="https://federated.withgoogle.com" target="T">this tutorial</a>.</p> <p><a class="glossary-anchor" name="feedback_loop"></a> <h2 class="hide-from-toc" id="feedback-loop" data-text=" feedback loop" tabindex="-1"> feedback loop</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>In machine learning, a situation in which a model&#39;s predictions influence the training data for the same model or another model. For example, a model that recommends movies will influence the movies that people see, which will then influence subsequent movie recommendation models.</p> <p>See <a href="/machine-learning/crash-course/production-ml-systems/questions">Production ML systems: Questions to ask</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="feedforward_neural_network"></a> <h2 class="hide-from-toc" id="feedforward-neural-network-ffn" data-text=" feedforward neural network (FFN)" tabindex="-1"> feedforward neural network (FFN)</h2></p> <p>A neural network without cyclic or recursive connections. For example, traditional <a href="#deep_neural_network"><strong>deep neural networks</strong></a> are feedforward neural networks. Contrast with <a href="#recurrent_neural_network"><strong>recurrent neural networks</strong></a>, which are cyclic.</p> <p><a class="glossary-anchor" name="few-shot_learning"></a> <h2 class="hide-from-toc" id="few-shot-learning" data-text=" few-shot learning" tabindex="-1"> few-shot learning</h2></p> <p>A machine learning approach, often used for object classification, designed to train effective classifiers from only a small number of training examples.</p> <p>See also <a href="#one-shot_learning"><strong>one-shot learning</strong></a> and <a href="#zero-shot-learning"><strong>zero-shot learning</strong></a>.</p> <p><a class="glossary-anchor" name="few-shot-prompting"></a> <h2 class="hide-from-toc" id="few-shot-prompting" data-text=" few-shot prompting" tabindex="-1"> few-shot prompting</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>A <a href="#prompt"><strong>prompt</strong></a> that contains more than one (a &quot;few&quot;) example demonstrating how the <a href="#large-language-model"><strong>large language model</strong></a> should respond. 
For example, the following lengthy prompt contains two examples showing a large language model how to answer a query.</p> <table> <tr> <th>Parts of one prompt</th> <th>Notes</th> </tr> <tr> <td><tt>What is the official currency of the specified country?</tt></td> <td>The question you want the LLM to answer.</td> </tr> <tr> <td><tt>France: EUR</tt></td> <td>One example.</td> </tr> <tr> <td><tt>United Kingdom: GBP</tt></td> <td>Another example.</td> </tr> <tr> <td><tt>India:</tt></td> <td>The actual query.</td> </tr> </table> <p>Few-shot prompting generally produces more desirable results than <a href="#zero-shot-prompting"><strong>zero-shot prompting</strong></a> and <a href="#one-shot-prompting"><strong>one-shot prompting</strong></a>. However, few-shot prompting requires a lengthier prompt.</p> <p>Few-shot prompting is a form of <a href="#few-shot_learning"><strong>few-shot learning</strong></a> applied to <a href="#prompt-based-learning"><strong>prompt-based learning</strong></a>.</p> <p>See <a href="https://developers.google.com/machine-learning/crash-course/llm/tuning#prompt_engineering">Prompt engineering</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="fiddle"></a> <h2 class="hide-from-toc" id="fiddle" data-text=" Fiddle" tabindex="-1"> Fiddle</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A Python-first <a href="#configuration"><strong>configuration</strong></a> library that sets the values of functions and classes without invasive code or infrastructure. In the case of <a href="#pax"><strong>Pax</strong></a>—and other ML codebases—these functions and classes represent <a href="#model"><strong>models</strong></a> and <a href="#training"><strong>training</strong></a> <a href="#hyperparameter"><strong>hyperparameters</strong></a>.</p> <p><a href="https://github.com/google/fiddle" target="T">Fiddle</a> assumes that machine learning codebases are typically divided into:</p> <ul> <li>Library code, which defines the layers and optimizers.</li> <li>Dataset &quot;glue&quot; code, which calls the libraries and wires everything together.</li> </ul> <p>Fiddle captures the call structure of the glue code in an unevaluated and mutable form.</p> <p><a class="glossary-anchor" name="fine-tuning"></a> <h2 class="hide-from-toc" id="fine-tuning" data-text="fine-tuning" tabindex="-1">fine-tuning</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Image Models">#image</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>A second, task-specific training pass performed on a <a href="#pre-trained-model"><strong>pre-trained model</strong></a> to refine its parameters for a specific use case. For example, the full training sequence for some <a href="#large-language-model"><strong>large language models</strong></a> is as follows:</p> <ol> <li><strong>Pre-training:</strong> Train a large language model on a vast <em>general</em> dataset, such as all the English language Wikipedia pages.</li> <li><strong>Fine-tuning:</strong> Train the pre-trained model to perform a <em>specific</em> task, such as responding to medical queries. 
Fine-tuning typically involves hundreds or thousands of examples focused on the specific task.</li> </ol> <p>As another example, the full training sequence for a large image model is as follows:</p> <ol> <li><strong>Pre-training:</strong> Train a large image model on a vast <em>general</em> image dataset, such as all the images in Wikimedia Commons.</li> <li><strong>Fine-tuning:</strong> Train the pre-trained model to perform a <em>specific</em> task, such as generating images of orcas.</li> </ol> <p>Fine-tuning can entail any combination of the following strategies:</p> <ul> <li>Modifying <em>all</em> of the pre-trained model&#39;s existing <a href="#parameter"><strong>parameters</strong></a>. This is sometimes called <strong>full fine-tuning</strong>.</li> <li>Modifying only <em>some</em> of the pre-trained model&#39;s existing parameters (typically, the layers closest to the <a href="#output_layer"><strong>output layer</strong></a>), while keeping other existing parameters unchanged (typically, the layers closest to the <a href="#input-layer"><strong>input layer</strong></a>). See <a href="#parameter-efficient-tuning"><strong>parameter-efficient tuning</strong></a>.</li> <li>Adding more layers, typically on top of the existing layers closest to the output layer.</li> </ul> <p>Fine-tuning is a form of <a href="#transfer_learning"><strong>transfer learning</strong></a>. As such, fine-tuning might use a different loss function or a different model type than those used to train the pre-trained model. For example, you could fine-tune a pre-trained large image model to produce a regression model that returns the number of birds in an input image.</p> <p>Compare and contrast fine-tuning with the following terms:</p> <ul> <li><a href="#distillation"><strong>distillation</strong></a></li> <li><a href="#prompt-based-learning"><strong>prompt-based learning</strong></a></li> </ul> <p>See <a href="https://developers.google.com/machine-learning/crash-course/llm/tuning#fine-tuning">Fine-tuning</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="flax"></a> <h2 class="hide-from-toc" id="flax" data-text=" Flax" tabindex="-1"> Flax</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A high-performance open-source <a href="https://github.com/google/flax" target="T"> library</a> for deep learning built on top of <a href="#JAX"><strong>JAX</strong></a>.
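</p> <p>For example, the following minimal sketch, which assumes the Flax Linen API, defines and initializes a small feedforward model; the module name and layer sizes are illustrative only:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded>import jax
import jax.numpy as jnp
from flax import linen as nn

class TwoLayerMLP(nn.Module):
    # A tiny feedforward network: one hidden layer plus an output layer.
    @nn.compact
    def __call__(self, x):
        x = nn.relu(nn.Dense(features=16)(x))
        return nn.Dense(features=1)(x)

model = TwoLayerMLP()
# Initialize parameters from a random key and a dummy batch with four features.
params = model.init(jax.random.PRNGKey(0), jnp.ones((1, 4)))
# Run a forward pass on a batch of eight examples.
predictions = model.apply(params, jnp.ones((8, 4)))
</pre></devsite-code> <p>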
Flax provides functions for <a href="#training"><strong>training</strong></a> <a href="#neural-network"><strong>neural networks</strong></a>, as well as methods for evaluating their performance.</p> <p><a class="glossary-anchor" name="flaxformer"></a> <h2 class="hide-from-toc" id="flaxformer" data-text=" Flaxformer" tabindex="-1"> Flaxformer</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>An open-source <a href="#transformer"><strong>Transformer</strong></a> <a href="https://github.com/google/flaxformer" target="T">library</a>, built on <a href="#flax"><strong>Flax</strong></a>, designed primarily for natural language processing and multimodal research.</p> <p><a class="glossary-anchor" name="forget_gate"></a> <h2 class="hide-from-toc" id="forget-gate" data-text="forget gate" tabindex="-1">forget gate</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Sequence Models">#seq</div> </div></p> <p>The portion of a <a href="#Long_Short-Term_Memory"><strong>Long Short-Term Memory</strong></a> cell that regulates the flow of information through the cell. Forget gates maintain context by deciding which information to discard from the cell state.</p> <p><a class="glossary-anchor" name="full_softmax"></a> <h2 class="hide-from-toc" id="full-softmax" data-text=" full softmax" tabindex="-1"> full softmax</h2></p> <p>Synonym for <a href="#softmax"><strong>softmax</strong></a>.</p> <p>Contrast with <a href="#candidate_sampling"><strong>candidate sampling</strong></a>.</p> <p>See <a href="/machine-learning/crash-course/neural-networks/multi-class">Neural networks: Multi-class classification</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="fully_connected_layer"></a> <h2 class="hide-from-toc" id="fully-connected-layer" data-text=" fully connected layer" tabindex="-1"> fully connected layer</h2></p> <p>A <a href="#hidden_layer"><strong>hidden layer</strong></a> in which each <a href="#node"><strong>node</strong></a> is connected to <em>every</em> node in the subsequent hidden layer.</p> <p>A fully connected layer is also known as a <a href="#dense_layer"><strong>dense layer</strong></a>.</p> <p><a class="glossary-anchor" name="function_transformation"></a> <h2 class="hide-from-toc" id="function-transformation" data-text=" function transformation" tabindex="-1"> function transformation</h2></p> <p>A function that takes a function as input and returns a transformed function as output. <a href="#JAX"><strong>JAX</strong></a> uses function transformations.</p> <p><a class="glossary-anchor" name="g"></a> <h2 class="glossary" id="g" data-text="G" tabindex="-1">G</h2></p> <p><a class="glossary-anchor" name="GAN"></a> <h2 class="hide-from-toc" id="gan" data-text=" GAN" tabindex="-1"> GAN</h2></p> <p>Abbreviation for <a href="#generative_adversarial_network"><strong>generative adversarial network</strong></a>.</p> <p><a class="glossary-anchor" name="generalization"></a> <h2 class="hide-from-toc" id="generalization" data-text=" generalization" tabindex="-1"> generalization</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A <a href="#model"><strong>model&#39;s</strong></a> ability to make correct predictions on new, previously unseen data. 
A model that can generalize is the opposite of a model that is <a href="#overfitting"><strong>overfitting</strong></a>.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-for-additional-notes._6" data-text=" Click the icon for additional notes. " tabindex="-1"> Click the icon for additional notes. </h4> <div class="expand-background"> <p> You train a model on the examples in the training set. Consequently, the model learns the peculiarities of the data in the training set. Generalization essentially asks whether your model can make good predictions on examples that are <i>not</i> in the training set. </p> <p> To encourage generalization, <a href="#regularization"><b>regularization</b></a> helps a model train less exactly to the peculiarities of the data in the training set. </p> </div> <hr /> </section> <p>See <a href="/machine-learning/crash-course/overfitting/generalization">Generalization</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="Gemini"></a> <h2 class="hide-from-toc" id="gemini" data-text=" Gemini" tabindex="-1"> Gemini</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Image Models">#image</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>The ecosystem comprising Google&#39;s most advanced AI. Elements of this ecosystem include:</p> <ul> <li>Various <a href="#Gemini-models"><strong>Gemini models</strong></a>.</li> <li>The interactive conversational interface to a <a href="#Gemini-models"><strong>Gemini model</strong></a>. Users type prompts and Gemini responds to those prompts.</li> <li>Various Gemini APIs.</li> <li>Various business products based on Gemini models; for example, <a href="https://cloud.google.com/products/gemini">Gemini for Google Cloud</a>.</li> </ul> <p><a class="glossary-anchor" name="Gemini-models"></a> <h2 class="hide-from-toc" id="gemini-models" data-text=" Gemini models" tabindex="-1"> Gemini models</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Image Models">#image</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>Google&#39;s state-of-the-art <a href="#Transformer"><strong>Transformer</strong></a>-based <a href="#multimodal-model"><strong>multimodal models</strong></a>. Gemini models are specifically designed to integrate with <a href="#agent"><strong>agents</strong></a>.</p> <p>Users can interact with Gemini models in a variety of ways, including through an interactive dialog interface and through SDKs.</p> <p><a class="glossary-anchor" name="generalization_curve"></a> <h2 class="hide-from-toc" id="generalization-curve" data-text=" generalization curve" tabindex="-1"> generalization curve</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A plot of both <a href="#training-loss"><strong>training loss</strong></a> and <a href="#validation-loss"><strong>validation loss</strong></a> as a function of the number of <a href="#iteration"><strong>iterations</strong></a>.</p> <p>A generalization curve can help you detect possible <a href="#overfitting"><strong>overfitting</strong></a>. 
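</p> <p>For instance, the following minimal sketch, which assumes that per-iteration training and validation losses have already been recorded in two Python lists, plots a generalization curve with Matplotlib; the loss values shown are illustrative only:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded>import matplotlib.pyplot as plt

# Assumed to have been recorded during training: one loss value per iteration.
training_loss = [0.9, 0.6, 0.45, 0.38, 0.33, 0.30, 0.28]
validation_loss = [0.92, 0.65, 0.52, 0.49, 0.50, 0.54, 0.60]

iterations = range(1, len(training_loss) + 1)
plt.plot(iterations, training_loss, label="training loss")
plt.plot(iterations, validation_loss, label="validation loss")
plt.xlabel("iterations")
plt.ylabel("loss")
plt.legend()
plt.show()  # Diverging curves (validation loss rising) suggest overfitting.
</pre></devsite-code> <p>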
For example, the following generalization curve suggests overfitting because validation loss ultimately becomes significantly higher than training loss.</p> <p> <img src="/static/machine-learning/glossary/images/GeneralizationCurveSmooth.png" loading="lazy" alt="A Cartesian graph in which the y-axis is labeled loss and the x-axis is labeled iterations. Two plots appear. One plots shows the training loss and the other shows the validation loss. The two plots start off similarly, but the training loss eventually dips far lower than the validation loss." > </p> <p>See <a href="/machine-learning/crash-course/overfitting/generalization">Generalization</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="generalized_linear_model"></a> <h2 class="hide-from-toc" id="generalized-linear-model" data-text=" generalized linear model" tabindex="-1"> generalized linear model</h2></p> <p>A generalization of <a href="#least_squares_regression"><strong>least squares regression</strong></a> models, which are based on <a href="https://wikipedia.org/wiki/Gaussian_noise" target="T">Gaussian noise</a>, to other types of models based on other types of noise, such as <a href="https://wikipedia.org/wiki/Shot_noise" target="T">Poisson noise</a> or categorical noise. Examples of generalized linear models include:</p> <ul> <li><a href="#logistic_regression"><strong>logistic regression</strong></a></li> <li>multi-class regression</li> <li>least squares regression</li> </ul> <p>The parameters of a generalized linear model can be found through <a href="#convex_optimization"><strong>convex optimization</strong></a>.</p> <p>Generalized linear models exhibit the following properties:</p> <ul> <li>The average prediction of the optimal least squares regression model is equal to the average label on the training data.</li> <li>The average probability predicted by the optimal logistic regression model is equal to the average label on the training data.</li> </ul> <p>The power of a generalized linear model is limited by its features. Unlike a deep model, a generalized linear model cannot &quot;learn new features.&quot;</p> <p><a class="glossary-anchor" name="generative_adversarial_network"></a> <h2 class="hide-from-toc" id="generative-adversarial-network-gan" data-text=" generative adversarial network (GAN)" tabindex="-1"> generative adversarial network (GAN)</h2></p> <p>A system to create new data in which a <a href="#generator"><strong>generator</strong></a> creates data and a <a href="#discriminator"><strong>discriminator</strong></a> determines whether that created data is valid or invalid.</p> <p>See the <a href="/machine-learning/gan">Generative Adversarial Networks course</a> for more information.</p> <p><a class="glossary-anchor" name="generative-ai"></a> <a class="glossary-anchor" name="generative-AI"></a> <h2 class="hide-from-toc" id="generative-ai" data-text=" generative AI" tabindex="-1"> generative AI</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Image Models">#image</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>An emerging transformative field with no formal definition. 
That said, most experts agree that generative AI models can create (&quot;generate&quot;) content that is all of the following:</p> <ul> <li>complex</li> <li>coherent</li> <li>original</li> </ul> <p>For example, a generative AI model can create sophisticated essays or images.</p> <p>Some earlier technologies, including <a href="#Long_Short-Term_Memory"><strong>LSTMs</strong></a> and <a href="#recurrent_neural_network"><strong>RNNs</strong></a>, can also generate original and coherent content. Some experts view these earlier technologies as generative AI, while others feel that true generative AI requires more complex output than those earlier technologies can produce.</p> <p>Contrast with <a href="#predictive-ML"><strong>predictive ML</strong></a>.</p> <p><a class="glossary-anchor" name="generative_model"></a> <h2 class="hide-from-toc" id="generative-model" data-text=" generative model" tabindex="-1"> generative model</h2></p> <p>Practically speaking, a model that does either of the following:</p> <ul> <li>Creates (generates) new examples from the training dataset. For example, a generative model could create poetry after training on a dataset of poems. The <a href="#generator"><strong>generator</strong></a> part of a <a href="#generative_adversarial_network"><strong>generative adversarial network</strong></a> falls into this category.</li> <li>Determines the probability that a new example comes from the training set, or was created from the same mechanism that created the training set. For example, after training on a dataset consisting of English sentences, a generative model could determine the probability that new input is a valid English sentence.</li> </ul> <p>A generative model can theoretically discern the distribution of examples or particular features in a dataset. That is:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Text only">p(examples)</pre></devsite-code> <p>Unsupervised learning models are generative.</p> <p>Contrast with <a href="#discriminative_model"><strong>discriminative models</strong></a>.</p> <p><a class="glossary-anchor" name="generator"></a> <h2 class="hide-from-toc" id="generator" data-text=" generator" tabindex="-1"> generator</h2></p> <p>The subsystem within a <a href="#generative_adversarial_network"><strong>generative adversarial network</strong></a> that creates new <a href="#example"><strong>examples</strong></a>.</p> <p>Contrast with <a href="#discriminative_model"><strong>discriminative model</strong></a>.</p> <p><a class="glossary-anchor" name="gini-impurity"></a> <h2 class="hide-from-toc" id="gini-impurity" data-text=" gini impurity " tabindex="-1"> gini impurity </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>A metric similar to <a href="#entropy"><strong>entropy</strong></a>. <a href="#splitter"><strong>Splitters</strong></a> use values derived from either gini impurity or entropy to compose <a href="#condition"><strong>conditions</strong></a> for classification <a href="#decision-tree"><strong>decision trees</strong></a>. <a href="#information-gain"><strong>Information gain</strong></a> is derived from entropy. 
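</p> <p>For example, the following minimal sketch computes the gini impurity of a set of binary labels; the formula and a worked example appear in the expandable note later in this entry:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded>def gini_impurity(labels):
    # Gini impurity of a set of binary (0/1) labels: I = 1 - (p**2 + q**2)
    p = sum(labels) / len(labels)   # fraction of "1" labels
    q = 1 - p                       # fraction of "0" labels
    return 1 - (p**2 + q**2)

# 100 labels of "1" and 300 labels of "0", matching the worked example below.
print(gini_impurity([1] * 100 + [0] * 300))  # prints 0.375
</pre></devsite-code> <p>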
There is no universally accepted equivalent term for the metric derived from gini impurity; however, this unnamed metric is just as important as information gain.</p> <p>Gini impurity is also called <strong>gini index</strong>, or simply <strong>gini</strong>.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-for-mathematical-details-about-gini-impurity." data-text=" Click the icon for mathematical details about gini impurity. " tabindex="-1"> Click the icon for mathematical details about gini impurity. </h4> <div class="expand-background"> <p>Gini impurity is the probability of misclassifying a new piece of data taken from the same distribution. The gini impurity of a set with two possible values "0" and "1" (for example, the labels in a <b><a href="#binary_classification">binary classification</a></b> problem) is calculated from the following formula:</p> <p> <tt>&nbsp;&nbsp; I = 1 - (p<sup>2</sup> + q<sup>2</sup>) = 1 - (p<sup>2</sup> + (1-p)<sup>2</sup>) </tt> </p> <p>where:</p> <ul> <li><tt>I</tt> is the gini impurity.</li> <li><tt>p</tt> is the fraction of "1" examples.</li> <li><tt>q</tt> is the fraction of "0" examples. Note that <tt>q = 1-p</tt></li> </ul> <p>For example, consider the following dataset:</p> <ul> <li>100 labels (0.25 of the dataset) contain the value "1"</li> <li>300 labels (0.75 of the dataset) contain the value "0"</li> </ul> <p>Therefore, the gini impurity is:</p> <ul> <li><tt>p = 0.25</tt></li> <li><tt>q = 0.75</tt></li> <li><tt>I = 1 - (0.25<sup>2</sup> + 0.75<sup>2</sup>) = <b>0.375</b></tt></li> </ul> <p>Consequently, a random label from the same dataset would have a 37.5% chance of being misclassified, and a 62.5% chance of being properly classified.</p> <p>A perfectly balanced label (for example, 200 "0"s and 200 "1"s) would have a gini impurity of 0.5. A highly <a href="#class_imbalanced_data_set"><b>imbalanced</b></a> label would have a gini impurity close to 0.0.</p> </div> <hr /> </section> <p><a class="glossary-anchor" name="golden dataset"></a> <h2 class="hide-from-toc" id="golden-dataset" data-text=" golden dataset" tabindex="-1"> golden dataset</h2></p> <p>A set of manually curated data that captures <a href="#ground_truth"><strong>ground truth</strong></a>. Teams can use one or more golden datasets to evaluate a model&#39;s quality.</p> <p>Some golden datasets capture different subdomains of ground truth. 
For example, a golden dataset for image classification might capture lighting conditions and image resolution.</p> <p><a class="glossary-anchor" name="GPT"></a> <h2 class="hide-from-toc" id="gpt-generative-pre-trained-transformer" data-text=" GPT (Generative Pre-trained Transformer)" tabindex="-1"> GPT (Generative Pre-trained Transformer)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A family of <a href="#Transformer"><strong>Transformer</strong></a>-based <a href="#large-language-model"><strong>large language models</strong></a> developed by <a href="https://openai.com/">OpenAI</a>.</p> <p>GPT variants can apply to multiple <a href="#modality"><strong>modalities</strong></a>, including:</p> <ul> <li>image generation (for example, ImageGPT)</li> <li>text-to-image generation (for example, <a href="https://openai.com/blog/dall-e/">DALL-E</a>).</li> </ul> <p><a class="glossary-anchor" name="gradient"></a> <h2 class="hide-from-toc" id="gradient" data-text=" gradient" tabindex="-1"> gradient</h2></p> <p>The vector of <a href="#partial_derivative"><strong>partial derivatives</strong></a> with respect to all of the independent variables. In machine learning, the gradient is the vector of partial derivatives of the model function. The gradient points in the direction of steepest ascent.</p> <p><a class="glossary-anchor" name="gradient_accumulation"></a> <h2 class="hide-from-toc" id="gradient-accumulation" data-text=" gradient accumulation" tabindex="-1"> gradient accumulation</h2></p> <p>A <a href="#backpropagation"><strong>backpropagation</strong></a> technique that updates the <a href="#parameter"><strong>parameters</strong></a> only <em>once per epoch</em> rather than once per iteration. After processing each <a href="#mini-batch"><strong>mini-batch</strong></a>, gradient accumulation simply updates a running total of gradients. Then, after processing the last mini-batch in the epoch, the system finally updates the parameters based on the total of all gradient changes.</p> <p>Gradient accumulation is useful when the <a href="#batch_size"><strong>batch size</strong></a> is very large compared to the amount of available memory for training. When memory is an issue, the natural tendency is to reduce batch size. However, reducing the batch size in normal backpropagation <em>increases</em> the number of parameter updates. 
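</p> <p>For example, the following framework-agnostic sketch illustrates the idea; compute_gradients and apply_update are hypothetical placeholder helpers, not part of any particular library:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded># Hypothetical helpers: compute_gradients(model, batch) returns one gradient
# per parameter for a mini-batch; apply_update(model, grads, learning_rate)
# adjusts the parameters.

def train_one_epoch(model, mini_batches, learning_rate):
    accumulated = None
    for batch in mini_batches:
        grads = compute_gradients(model, batch)  # one mini-batch fits in memory
        if accumulated is None:
            accumulated = grads
        else:
            accumulated = [a + g for a, g in zip(accumulated, grads)]
    # A single parameter update based on the running total of all gradients.
    apply_update(model, accumulated, learning_rate)
</pre></devsite-code> <p>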
Gradient accumulation enables the model to avoid memory issues but still train efficiently.</p> <p><a class="glossary-anchor" name="gbt"></a> <h2 class="hide-from-toc" id="gradient-boosted-decision-trees-gbt" data-text=" gradient boosted (decision) trees (GBT) " tabindex="-1"> gradient boosted (decision) trees (GBT) </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>A type of <a href="#decision-forest"><strong>decision forest</strong></a> in which:</p> <ul> <li><a href="#training"><strong>Training</strong></a> relies on <a href="#gradient-boosting"><strong>gradient boosting</strong></a>.</li> <li>The weak model is a <a href="#decision-tree"><strong>decision tree</strong></a>.</li> </ul> <p>See <a href="/machine-learning/decision-forests/intro-to-gbdt">Gradient Boosted Decision Trees</a> in the Decision Forests course for more information.</p> <p><a class="glossary-anchor" name="gradient-boosting"></a> <h2 class="hide-from-toc" id="gradient-boosting" data-text=" gradient boosting" tabindex="-1"> gradient boosting</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>A training algorithm where weak models are trained to iteratively improve the quality (reduce the loss) of a strong model. For example, a weak model could be a linear or small decision tree model. The strong model becomes the sum of all the previously trained weak models.</p> <p>In the simplest form of gradient boosting, at each iteration, a weak model is trained to predict the loss gradient of the strong model. Then, the strong model&#39;s output is updated by subtracting the predicted gradient, similar to <a href="#gradient_descent"><strong>gradient descent</strong></a>.</p> <div> $$F_{0} = 0$$ $$F_{i+1} = F_i - \xi f_i $$ </div> <p>where:</p> <ul> <li>$F_{0}$ is the starting strong model.</li> <li>$F_{i+1}$ is the next strong model.</li> <li>$F_{i}$ is the current strong model.</li> <li>$\xi$ is a value between 0.0 and 1.0 called <a href="#shrinkage"><b>shrinkage</b></a>, which is analogous to the <a href="#learning_rate"><b>learning rate</b></a> in gradient descent.</li> <li>$f_{i}$ is the weak model trained to predict the loss gradient of $F_{i}$.</li> </ul> <p>Modern variations of gradient boosting also include the second derivative (Hessian) of the loss in their computation.</p> <p><a href="#decision-tree"><strong>Decision trees</strong></a> are commonly used as weak models in gradient boosting. 
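</p> <p>For example, the following minimal sketch implements the preceding update rule for squared-error regression, where the loss gradient of the strong model is simply (prediction - label); fit_weak_model is a hypothetical helper that fits a small model, such as a shallow decision tree, and shrinkage corresponds to the shrinkage factor in the formula above:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded># fit_weak_model(features, targets) is a hypothetical helper that returns a
# small model (for example, a shallow decision tree) with a .predict() method.

def gradient_boost(features, labels, num_rounds, shrinkage=0.1):
    strong_predictions = [0.0] * len(labels)   # F_0 = 0
    weak_models = []
    for _ in range(num_rounds):
        # For squared-error loss, the loss gradient of the strong model with
        # respect to its predictions is (prediction - label).
        gradients = [pred - label for pred, label in zip(strong_predictions, labels)]
        weak_model = fit_weak_model(features, gradients)
        weak_models.append(weak_model)
        # F_{i+1} = F_i - shrinkage * f_i
        strong_predictions = [pred - shrinkage * g for pred, g in
                              zip(strong_predictions, weak_model.predict(features))]
    return weak_models
</pre></devsite-code> <p>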
See <a href="#gbt"><strong>gradient boosted (decision) trees</strong></a>.</p> <p><a class="glossary-anchor" name="gradient_clipping"></a> <h2 class="hide-from-toc" id="gradient-clipping" data-text=" gradient clipping" tabindex="-1"> gradient clipping</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Sequence Models">#seq</div> </div></p> <p>A commonly used mechanism to mitigate the <a href="#exploding_gradient_problem"><strong>exploding gradient problem</strong></a> by artificially limiting (clipping) the maximum value of gradients when using <a href="#gradient_descent"><strong>gradient descent</strong></a> to <a href="#training"><strong>train</strong></a> a model.</p> <p><a class="glossary-anchor" name="gradient_descent"></a> <h2 class="hide-from-toc" id="gradient-descent" data-text=" gradient descent" tabindex="-1"> gradient descent</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A mathematical technique to minimize <a href="#loss"><strong>loss</strong></a>. Gradient descent iteratively adjusts <a href="#weight"><strong>weights</strong></a> and <a href="#bias"><strong>biases</strong></a>, gradually finding the best combination to minimize loss.</p> <p>Gradient descent is older—much, much older—than machine learning.</p> <p>See the <a href="/machine-learning/crash-course/linear-regression/gradient-descent">Linear regression: Gradient descent</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="graph"></a> <h2 class="hide-from-toc" id="graph" data-text=" graph" tabindex="-1"> graph</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>In TensorFlow, a computation specification. Nodes in the graph represent operations. Edges are directed and represent passing the result of an operation (a <a href="#tensor"><strong>Tensor</strong></a>) as an operand to another operation. Use <a href="#TensorBoard"><strong>TensorBoard</strong></a> to visualize a graph.</p> <p><a class="glossary-anchor" name="graph_execution"></a> <h2 class="hide-from-toc" id="graph-execution" data-text=" graph execution" tabindex="-1"> graph execution</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>A TensorFlow programming environment in which the program first constructs a <a href="#graph"><strong>graph</strong></a> and then executes all or part of that graph. 
Graph execution is the default execution mode in TensorFlow 1.x.</p> <p>Contrast with <a href="#eager_execution"><strong>eager execution</strong></a>.</p> <p><a class="glossary-anchor" name="greedy_policy"></a> <h2 class="hide-from-toc" id="greedy-policy" data-text=" greedy policy" tabindex="-1"> greedy policy</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>In reinforcement learning, a <a href="#policy"><strong>policy</strong></a> that always chooses the action with the highest expected <a href="#return"><strong>return</strong></a>.</p> <p><a class="glossary-anchor" name="ground_truth"></a> <h2 class="hide-from-toc" id="ground-truth" data-text=" ground truth" tabindex="-1"> ground truth</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Reality.</p> <p>The thing that actually happened.</p> <p>For example, consider a <a href="#binary_classification"><strong>binary classification</strong></a> model that predicts whether a student in their first year of university will graduate within six years. Ground truth for this model is whether or not that student actually graduated within six years.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-for-additional-notes._7" data-text=" Click the icon for additional notes. " tabindex="-1"> Click the icon for additional notes. </h4> <div class="expand-background"> <p> We assess model quality against ground truth. However, ground truth is not always completely, well, truthful. For example, consider the following examples of potential imperfections in ground truth:</p> <ul> <li>In the graduation example, are we <i>certain</i> that the graduation records for each student are always correct? Is the university's record-keeping flawless?</li> <li>Suppose the label is a floating-point value measured by instruments (for example, barometers). How can we be sure that each instrument is calibrated identically or that each reading was taken under the same circumstances?</li> <li>If the label is a matter of human opinion, how can we be sure that each human <a href="#rater"><b>rater</b></a> is evaluating events in the same way? To improve consistency, <i>expert</i> human raters sometimes intervene.</li> </ul> </div> <hr /> </section> <p><a class="glossary-anchor" name="group_attribution_bias"></a> <h2 class="hide-from-toc" id="group-attribution-bias" data-text=" group attribution bias " tabindex="-1"> group attribution bias </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>Assuming that what is true for an individual is also true for everyone in that group. The effects of group attribution bias can be exacerbated if a <a href="#convenience_sampling"><strong>convenience sampling</strong></a> is used for data collection. In a non-representative sample, attributions may be made that don&#39;t reflect reality.</p> <p>See also <a href="#out-group_homogeneity_bias"><strong>out-group homogeneity bias</strong></a> and <a href="#in-group_bias"><strong>in-group bias</strong></a>. 
Also, see <a href="/machine-learning/crash-course/fairness/types-of-bias">Fairness: Types of bias</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="h"></a> <h2 class="glossary" id="h" data-text="H" tabindex="-1">H</h2></p> <p><a class="glossary-anchor" name="hallucination"></a> <h2 class="hide-from-toc" id="hallucination" data-text=" hallucination" tabindex="-1"> hallucination</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>The production of plausible-seeming but factually incorrect output by a <a href="#generative-AI"><strong>generative AI</strong></a> model that purports to be making an assertion about the real world. For example, a generative AI model that claims that Barack Obama died in 1865 is <em>hallucinating</em>.</p> <p><a class="glossary-anchor" name="hashing"></a> <h2 class="hide-from-toc" id="hashing" data-text=" hashing" tabindex="-1"> hashing</h2></p> <p>In machine learning, a mechanism for bucketing <a href="#categorical_data"><strong>categorical data</strong></a>, particularly when the number of categories is large, but the number of categories actually appearing in the dataset is comparatively small.</p> <p>For example, Earth is home to about 73,000 tree species. You could represent each of the 73,000 tree species in 73,000 separate categorical buckets. Alternatively, if only 200 of those tree species actually appear in a dataset, you could use hashing to divide tree species into perhaps 500 buckets.</p> <p>A single bucket could contain multiple tree species. For example, hashing could place <em>baobab</em> and <em>red maple</em>—two genetically dissimilar species—into the same bucket. Regardless, hashing is still a good way to map large categorical sets into the selected number of buckets. Hashing turns a categorical feature having a large number of possible values into a much smaller number of values by grouping values in a deterministic way.</p> <p>See <a href="/machine-learning/crash-course/categorical-data/one-hot-encoding">Categorical data: Vocabulary and one-hot encoding</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="heuristic"></a> <h2 class="hide-from-toc" id="heuristic" data-text=" heuristic" tabindex="-1"> heuristic</h2></p> <p>A simple and quickly implemented solution to a problem. For example, &quot;With a heuristic, we achieved 86% accuracy. When we switched to a deep neural network, accuracy went up to 98%.&quot;</p> <p><a class="glossary-anchor" name="hidden_layer"></a> <h2 class="hide-from-toc" id="hidden-layer" data-text=" hidden layer" tabindex="-1"> hidden layer</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A layer in a <a href="#neural_network"><strong>neural network</strong></a> between the <a href="#input-layer"><strong>input layer</strong></a> (the features) and the <a href="#output_layer"><strong>output layer</strong></a> (the prediction). Each hidden layer consists of one or more <a href="#neuron"><strong>neurons</strong></a>. For example, the following neural network contains two hidden layers, the first with three neurons and the second with two neurons:</p> <p> <img src="/static/machine-learning/glossary/images/HiddenLayerBigPicture.png" loading="lazy" width="750" alt="Four layers. The first layer is an input layer containing two features. 
The second layer is a hidden layer containing three neurons. The third layer is a hidden layer containing two neurons. The fourth layer is an output layer. Each feature contains three edges, each of which points to a different neuron in the second layer. Each of the neurons in the second layer contains two edges, each of which points to a different neuron in the third layer. Each of the neurons in the third layer contain one edge, each pointing to the output layer." > </p> <p>A <a href="#deep_neural_network"><strong>deep neural network</strong></a> contains more than one hidden layer. For example, the preceding illustration is a deep neural network because the model contains two hidden layers.</p> <p>See <a href="/machine-learning/crash-course/neural-networks/nodes-hidden-layers">Neural networks: Nodes and hidden layers</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="hierarchical_clustering"></a> <h2 class="hide-from-toc" id="hierarchical-clustering" data-text=" hierarchical clustering" tabindex="-1"> hierarchical clustering</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Clustering">#clustering</div> </div></p> <p>A category of <a href="#clustering"><strong>clustering</strong></a> algorithms that create a tree of clusters. Hierarchical clustering is well-suited to hierarchical data, such as botanical taxonomies. There are two types of hierarchical clustering algorithms:</p> <ul> <li><strong>Agglomerative clustering</strong> first assigns every example to its own cluster, and iteratively merges the closest clusters to create a hierarchical tree.</li> <li><strong>Divisive clustering</strong> first groups all examples into one cluster and then iteratively divides the cluster into a hierarchical tree.</li> </ul> <p>Contrast with <a href="#centroid_based_clustering"><strong>centroid-based clustering</strong></a>.</p> <p>See <a href="/machine-learning/clustering/clustering-algorithms">Clustering algorithms</a> in the Clustering course for more information.</p> <p><a class="glossary-anchor" name="hinge-loss"></a> <h2 class="hide-from-toc" id="hinge-loss" data-text=" hinge loss" tabindex="-1"> hinge loss</h2></p> <p>A family of <a href="#loss"><strong>loss</strong></a> functions for <a href="#classification_model"><strong>classification</strong></a> designed to find the <a href="#decision_boundary"><strong>decision boundary</strong></a> as distant as possible from each training example, thus maximizing the margin between examples and the boundary. <a href="#KSVMs"><strong>KSVMs</strong></a> use hinge loss (or a related function, such as squared hinge loss). For binary classification, the hinge loss function is defined as follows:</p> <div> $$\text{loss} = \text{max}(0, 1 - (y * y'))$$ </div> <p>where <em>y</em> is the true label, either -1 or +1, and <em>y&#39;</em> is the raw output of the classifier model:</p> <div> $$y' = b + w_1x_1 + w_2x_2 + … w_nx_n$$ </div> <p>Consequently, a plot of hinge loss versus (y * y&#39;) looks as follows:</p> <p> <img src="/static/machine-learning/glossary/images/hinge-loss.svg" loading="lazy" alt="A Cartesian plot consisting of two joined line segments. The first line segment starts at (-3, 4) and ends at (1, 0). The second line segment begins at (1, 0) and continues indefinitely with a slope of 0." 
> </p> <p><a class="glossary-anchor" name="historical_bias"></a> <h2 class="hide-from-toc" id="historical-bias" data-text=" historical bias" tabindex="-1"> historical bias</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>A type of <a href="#bias_ethics"><strong>bias</strong></a> that already exists in the world and has made its way into a dataset. These biases have a tendency to reflect existing cultural stereotypes, demographic inequalities, and prejudices against certain social groups.</p> <p>For example, consider a <a href="#classification_model"><strong>classification model</strong></a> that predicts whether or not a loan applicant will default on their loan, which was trained on historical loan-default data from the 1980s from local banks in two different communities. If past applicants from Community A were six times more likely to default on their loans than applicants from Community B, the model might learn a historical bias resulting in the model being less likely to approve loans in Community A, even if the historical conditions that resulted in that community&#39;s higher default rates were no longer relevant.</p> <p>See <a href="/machine-learning/crash-course/fairness/types-of-bias">Fairness: Types of bias</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="holdout_data"></a> <h2 class="hide-from-toc" id="holdout-data" data-text=" holdout data" tabindex="-1"> holdout data</h2></p> <p><a href="#example"><strong>Examples</strong></a> intentionally not used (&quot;held out&quot;) during training. The <a href="#validation_set"><strong>validation dataset</strong></a> and <a href="#test_set"><strong>test dataset</strong></a> are examples of holdout data. Holdout data helps evaluate your model&#39;s ability to generalize to data other than the data it was trained on. The loss on the holdout set provides a better estimate of the loss on an unseen dataset than does the loss on the training set.</p> <p><a class="glossary-anchor" name="host"></a> <h2 class="hide-from-toc" id="host" data-text=" host" tabindex="-1"> host</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> <div class="glossary-icon" title="Google Cloud">#GoogleCloud</div> </div></p> <p>When training an ML model on <a href="#accelerator-chip"><strong>accelerator chips</strong></a> (GPUs or <a href="#TPU"><strong>TPUs</strong></a>), the part of the system that controls both of the following:</p> <ul> <li>The overall flow of the code.</li> <li>The extraction and transformation of the input pipeline.</li> </ul> <p>The host typically runs on a CPU, not on an accelerator chip; the <a href="#device"><strong>device</strong></a> manipulates <a href="#tensor"><strong>tensors</strong></a> on the accelerator chips.</p> <p><a class="glossary-anchor" name="hyperparameter"></a> <h2 class="hide-from-toc" id="hyperparameter" data-text=" hyperparameter" tabindex="-1"> hyperparameter</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The variables that you or a hyperparameter tuning service adjust during successive runs of training a model. For example, <a href="#learning_rate"><strong>learning rate</strong></a> is a hyperparameter. You could set the learning rate to 0.01 before one training session. 
If you determine that 0.01 is too high, you could perhaps set the learning rate to 0.003 for the next training session.</p> <p>In contrast, <a href="#parameter"><strong>parameters</strong></a> are the various <a href="#weight"><strong>weights</strong></a> and <a href="#bias"><strong>biases</strong></a> that the model <em>learns</em> during training.</p> <p>See <a href="/machine-learning/crash-course/linear-regression/hyperparameters">Linear regression: Hyperparameters</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="hyperplane"></a> <h2 class="hide-from-toc" id="hyperplane" data-text=" hyperplane" tabindex="-1"> hyperplane</h2></p> <p>A boundary that separates a space into two subspaces. For example, a line is a hyperplane in two dimensions and a plane is a hyperplane in three dimensions. More typically in machine learning, a hyperplane is the boundary separating a high-dimensional space. <a href="#KSVMs"><strong>Kernel Support Vector Machines</strong></a> use hyperplanes to separate positive classes from negative classes, often in a very high-dimensional space.</p> <p><a class="glossary-anchor" name="i"></a> <h2 class="glossary" id="i" data-text="I" tabindex="-1">I</h2></p> <p><a class="glossary-anchor" name="iid_abbreviation"></a> <h2 class="hide-from-toc" id="i.i.d." data-text=" i.i.d." tabindex="-1"> i.i.d.</h2></p> <p>Abbreviation for <a href="#iid"><strong>independently and identically distributed</strong></a>.</p> <p><a class="glossary-anchor" name="image_recognition"></a> <h2 class="hide-from-toc" id="image-recognition" data-text=" image recognition" tabindex="-1"> image recognition</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>A process that classifies object(s), pattern(s), or concept(s) in an image. Image recognition is also known as <strong>image classification</strong>.</p> <p>See the <a href="/machine-learning/practica/image-classification">ML Practicum: Image Classification course</a> for more information.</p> <p><a class="glossary-anchor" name="imbalanced_data_set"></a> <h2 class="hide-from-toc" id="imbalanced-dataset" data-text=" imbalanced dataset" tabindex="-1"> imbalanced dataset</h2></p> <p>Synonym for <a href="#class_imbalanced_data_set"><strong>class-imbalanced dataset</strong></a>.</p> <p><a class="glossary-anchor" name="implicit_bias"></a> <h2 class="hide-from-toc" id="implicit-bias" data-text=" implicit bias " tabindex="-1"> implicit bias </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>Automatically making an association or assumption based on one&#39;s mental models and memories. Implicit bias can affect the following:</p> <ul> <li>How data is collected and classified.</li> <li>How machine learning systems are designed and developed.</li> </ul> <p>For example, when building a classifier to identify wedding photos, an engineer may use the presence of a white dress in a photo as a feature.
However, white dresses have been customary only during certain eras and in certain cultures.</p> <p>See also <a href="#confirmation_bias"><strong>confirmation bias</strong></a>.</p> <p><a class="glossary-anchor" name="imputation"></a> <h2 class="hide-from-toc" id="imputation" data-text=" imputation" tabindex="-1"> imputation</h2></p> <p>Short form of <a href="#value-imputation"><strong>value imputation</strong></a>.</p> <p><a class="glossary-anchor" name="incompatibility_of_fairness_metrics"></a> <a class="glossary-anchor" name="incompatibility"></a> <h2 class="hide-from-toc" id="incompatibility-of-fairness-metrics" data-text=" incompatibility of fairness metrics" tabindex="-1"> incompatibility of fairness metrics</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>The idea that some notions of fairness are mutually incompatible and cannot be satisfied simultaneously. As a result, there is no single universal <a href="#fairness_metric"><strong>metric</strong></a> for quantifying fairness that can be applied to all ML problems.</p> <p>While this may seem discouraging, incompatibility of fairness metrics doesn&#39;t imply that fairness efforts are fruitless. Instead, it suggests that fairness must be defined contextually for a given ML problem, with the goal of preventing harms specific to its use cases.</p> <p>See <a href="https://arxiv.org/pdf/1609.07236.pdf" target="T">&quot;On the (im)possibility of fairness&quot;</a> for a more detailed discussion of this topic.</p> <p><a class="glossary-anchor" name="in-context-learning"></a> <h2 class="hide-from-toc" id="in-context-learning" data-text="in-context learning" tabindex="-1">in-context learning</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>Synonym for <a href="#few-shot-prompting"><strong>few-shot prompting</strong></a>.</p> <p><a class="glossary-anchor" name="iid"></a> <h2 class="hide-from-toc" id="independently-and-identically-distributed-i.i.d" data-text="independently and identically distributed (i.i.d)" tabindex="-1">independently and identically distributed (i.i.d)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Data drawn from a distribution that doesn&#39;t change, and where each value drawn doesn&#39;t depend on values that have been drawn previously. An i.i.d. is the <a href="https://wikipedia.org/wiki/Ideal_gas" target="T">ideal gas</a> of machine learning—a useful mathematical construct but almost never exactly found in the real world. For example, the distribution of visitors to a web page may be i.i.d. over a brief window of time; that is, the distribution doesn&#39;t change during that brief window and one person&#39;s visit is generally independent of another&#39;s visit. 
However, if you expand that window of time, seasonal differences in the web page&#39;s visitors may appear.</p> <p>See also <a href="#nonstationarity"><strong>nonstationarity</strong></a>.</p> <p><a class="glossary-anchor" name="individual_fairness"></a> <h2 class="hide-from-toc" id="individual-fairness" data-text=" individual fairness" tabindex="-1"> individual fairness</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>A fairness metric that checks whether similar individuals are classified similarly. For example, Brobdingnagian Academy might want to satisfy individual fairness by ensuring that two students with identical grades and standardized test scores are equally likely to gain admission.</p> <p>Note that individual fairness relies entirely on how you define &quot;similarity&quot; (in this case, grades and test scores), and you can run the risk of introducing new fairness problems if your similarity metric misses important information (such as the rigor of a student&#39;s curriculum).</p> <p>See <a href="https://arxiv.org/pdf/1104.3913.pdf" target="T">&quot;Fairness Through Awareness&quot;</a> for a more detailed discussion of individual fairness.</p> <p><a class="glossary-anchor" name="inference"></a> <h2 class="hide-from-toc" id="inference" data-text=" inference" tabindex="-1"> inference</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>In machine learning, the process of making predictions by applying a trained model to <a href="#unlabeled_example"><strong>unlabeled examples</strong></a>.</p> <p>Inference has a somewhat different meaning in statistics. See the <a href="https://wikipedia.org/wiki/Statistical_inference" target="T"> Wikipedia article on statistical inference</a> for details.</p> <p>See <a href="/machine-learning/intro-to-ml/supervised">Supervised Learning</a> in the Intro to ML course to see inference&#39;s role in a supervised learning system.</p> <p><a class="glossary-anchor" name="inference-path"></a> <h2 class="hide-from-toc" id="inference-path" data-text=" inference path " tabindex="-1"> inference path </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>In a <a href="#decision-tree"><strong>decision tree</strong></a>, during <a href="#inference"><strong>inference</strong></a>, the route a particular <a href="#example"><strong>example</strong></a> takes from the <a href="#root"><strong>root</strong></a> to other <a href="#condition"><strong>conditions</strong></a>, terminating with a <a href="#leaf"><strong>leaf</strong></a>. For example, in the following decision tree, the thicker arrows show the inference path for an example with the following feature values:</p> <ul> <li>x = 7</li> <li>y = 12</li> <li>z = -3</li> </ul> <p>The inference path in the following illustration travels through three conditions before reaching the leaf (<code translate="no" dir="ltr">Zeta</code>).</p> <p> <img src="/static/machine-learning/glossary/images/information-gain.png" loading="lazy" width="490" alt="A decision tree consisting of four conditions and five leaves. The root condition is (x > 0). Since the answer is Yes, the inference path travels from the root to the next condition (y > 0). Since the answer is Yes, the inference path then travels to the next condition (z > 0). 
Since the answer is No, the inference path travels to its terminal node, which is the leaf (Zeta)." > </p> <p><b>The three thick arrows show the inference path.</b></p> <p>See <a href="/machine-learning/decision-forests/decision-trees">Decision trees</a> in the Decision Forests course for more information.</p> <p><a class="glossary-anchor" name="information-gain"></a> <h2 class="hide-from-toc" id="information-gain" data-text=" information gain " tabindex="-1"> information gain </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>In <a href="#decision-forest"><strong>decision forests</strong></a>, the difference between a node&#39;s <a href="#entropy"><strong>entropy</strong></a> and the weighted (by number of examples) sum of the entropy of its children nodes. A node&#39;s entropy is the entropy of the examples in that node.</p> <p>For example, consider the following entropy values:</p> <ul> <li>entropy of parent node = 0.6</li> <li>entropy of one child node with 16 relevant examples = 0.2</li> <li>entropy of another child node with 24 relevant examples = 0.1</li> </ul> <p>So 40% of the examples are in one child node and 60% are in the other child node. Therefore:</p> <ul> <li>weighted entropy sum of child nodes = (0.4 * 0.2) + (0.6 * 0.1) = 0.14</li> </ul> <p>So, the information gain is:</p> <ul> <li>information gain = entropy of parent node - weighted entropy sum of child nodes</li> <li>information gain = 0.6 - 0.14 = 0.46</li> </ul> <p>Most <a href="#splitter"><strong>splitters</strong></a> seek to create <a href="#condition"><strong>conditions</strong></a> that maximize information gain.</p> <p><a class="glossary-anchor" name="in-group_bias"></a> <h2 class="hide-from-toc" id="in-group-bias" data-text=" in-group bias " tabindex="-1"> in-group bias </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>Showing partiality to one&#39;s own group or own characteristics. If testers or raters consist of the machine learning developer&#39;s friends, family, or colleagues, then in-group bias may invalidate product testing or the dataset.</p> <p>In-group bias is a form of <a href="#group_attribution_bias"><strong>group attribution bias</strong></a>. See also <a href="#out-group_homogeneity_bias"><strong>out-group homogeneity bias</strong></a>.</p> <p>See <a href="/machine-learning/crash-course/fairness/types-of-bias">Fairness: Types of bias</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="input_generator"></a> <h2 class="hide-from-toc" id="input-generator" data-text=" input generator" tabindex="-1"> input generator</h2></p> <p>A mechanism by which data is loaded into a <a href="#neural-network"><strong>neural network</strong></a>.</p> <p>An input generator can be thought of as a component responsible for processing raw data into tensors which are iterated over to generate batches for training, evaluation, and inference.</p> <p><a class="glossary-anchor" name="input-layer"></a> <h2 class="hide-from-toc" id="input-layer" data-text=" input layer" tabindex="-1"> input layer</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The <a href="#layer"><strong>layer</strong></a> of a <a href="#neural_network"><strong>neural network</strong></a> that holds the <a href="#feature_vector"><strong>feature vector</strong></a>. 
That is, the input layer provides <a href="#example"><strong>examples</strong></a> for <a href="#training"><strong>training</strong></a> or <a href="#inference"><strong>inference</strong></a>. For example, the input layer in the following neural network consists of two features:</p> <p> <img src="/static/machine-learning/glossary/images/InputLayer.png" loading="lazy" width="750" alt="Four layers: an input layer, two hidden layers, and an output layer." > </p> <p><a class="glossary-anchor" name="in-set-condition"></a> <h2 class="hide-from-toc" id="in-set-condition" data-text=" in-set condition " tabindex="-1"> in-set condition </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>In a <a href="#decision-tree"><strong>decision tree</strong></a>, a <a href="#condition"><strong>condition</strong></a> that tests for the presence of one item in a set of items. For example, the following is an in-set condition:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Text only"><code translate="no" dir="ltr"> house-style in [tudor, colonial, cape] </code></pre></devsite-code> <p>During inference, if the value of the house-style <a href="#feature"><strong>feature</strong></a> is <code translate="no" dir="ltr">tudor</code> or <code translate="no" dir="ltr">colonial</code> or <code translate="no" dir="ltr">cape</code>, then this condition evaluates to Yes. If the value of the house-style feature is something else (for example, <code translate="no" dir="ltr">ranch</code>), then this condition evaluates to No.</p> <p>In-set conditions usually lead to more efficient decision trees than conditions that test <a href="#one-hot_encoding"><strong>one-hot encoded</strong></a> features.</p> <p><a class="glossary-anchor" name="instance"></a> <h2 class="hide-from-toc" id="instance" data-text=" instance" tabindex="-1"> instance</h2></p> <p>Synonym for <a href="#example"><strong>example</strong></a>.</p> <p><a class="glossary-anchor" name="instruction-tuning"></a> <h2 class="hide-from-toc" id="instruction-tuning" data-text=" instruction tuning" tabindex="-1"> instruction tuning</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>A form of <a href="#fine-tuning"><strong>fine-tuning</strong></a> that improves a <a href="#generative-AI"><strong>generative AI</strong></a> model&#39;s ability to follow instructions. Instruction tuning involves training a model on a series of instruction prompts, typically covering a wide variety of tasks. 
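<p>For example, a few hypothetical training pairs might look like the following sketch (the field names and the example prompts are invented for illustration):</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr"><code translate="no" dir="ltr"># Hypothetical instruction-tuning examples: each pairs an instruction
# prompt with a desired response. Real datasets contain many thousands
# of such pairs spanning many different tasks.
instruction_examples = [
    {"prompt": "Summarize the following paragraph: ...",
     "response": "A one-sentence summary."},
    {"prompt": "Translate 'good morning' to French.",
     "response": "Bonjour."},
    {"prompt": "Classify the sentiment of: 'I loved this movie.'",
     "response": "positive"},
]</code></pre></devsite-code>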
The resulting instruction-tuned model then tends to generate useful responses to <a href="#zero-shot-prompting"><strong>zero-shot prompts</strong></a> across a variety of tasks.</p> <p>Compare and contrast with:</p> <ul> <li><a href="#parameter-efficient-tuning"><strong>parameter-efficient tuning</strong></a></li> <li><a href="#prompt-tuning"><strong>prompt tuning</strong></a></li> </ul> <p><a class="glossary-anchor" name="interpretability"></a> <h2 class="hide-from-toc" id="interpretability" data-text=" interpretability" tabindex="-1"> interpretability</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The ability to explain or to present an ML <a href="#model"><strong>model&#39;s</strong></a> reasoning in understandable terms to a human.</p> <p>Most <a href="#linear_regression"><strong>linear regression</strong></a> models, for example, are highly interpretable. (You merely need to look at the trained weights for each feature.) Decision forests are also highly interpretable. Some models, however, require sophisticated visualization to become interpretable.</p> <p>You can use the <a href="#Learning-Interpretability-Tool"><strong>Learning Interpretability Tool (LIT)</strong></a> to interpret ML models.</p> <p><a class="glossary-anchor" name="inter-rater_agreement"></a> <h2 class="hide-from-toc" id="inter-rater-agreement" data-text=" inter-rater agreement" tabindex="-1"> inter-rater agreement</h2></p> <p>A measurement of how often human raters agree when doing a task. If raters disagree, the task instructions may need to be improved. Also sometimes called <strong>inter-annotator agreement</strong> or <strong>inter-rater reliability</strong>. See also <a href="https://wikipedia.org/wiki/Cohen%27s_kappa" target="T">Cohen&#39;s kappa</a>, which is one of the most popular inter-rater agreement measurements.</p> <p>See <a href="/machine-learning/crash-course/categorical-data/issues">Categorical data: Common issues</a> in Machine Learning Crash Course for more information.</p> <p><a class="glossary-anchor" name="intersection_over_union"></a> <h2 class="hide-from-toc" id="intersection-over-union-iou" data-text=" intersection over union (IoU)" tabindex="-1"> intersection over union (IoU)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>The intersection of two sets divided by their union. In machine-learning image-detection tasks, IoU is used to measure the accuracy of the model&#39;s predicted <a href="#bounding_box"><strong>bounding box</strong></a> with respect to the <a href="#ground_truth"><strong>ground-truth</strong></a> bounding box. 
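<p>For axis-aligned boxes given as <code translate="no" dir="ltr">(x_min, y_min, x_max, y_max)</code> coordinates, IoU can be computed directly; the following is a minimal sketch in plain Python (the coordinate convention and the sample boxes are assumptions for illustration):</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr"><code translate="no" dir="ltr">def iou(box_a, box_b):
    """Returns intersection over union of two axis-aligned boxes."""
    # Corners of the overlapping rectangle, if any.
    x_left = max(box_a[0], box_b[0])
    y_top = max(box_a[1], box_b[1])
    x_right = min(box_a[2], box_b[2])
    y_bottom = min(box_a[3], box_b[3])

    intersection = max(0, x_right - x_left) * max(0, y_bottom - y_top)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - intersection
    return intersection / union

# Two 2x2 boxes whose overlap is a single unit square.
print(iou((0, 0, 2, 2), (1, 1, 3, 3)))  # 0.142857... (that is, 1/7)</code></pre></devsite-code>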
In this case, the IoU for the two boxes is the ratio between the overlapping area and the total area, and its value ranges from 0 (no overlap of predicted bounding box and ground-truth bounding box) to 1 (predicted bounding box and ground-truth bounding box have the exact same coordinates).</p> <p>For example, in the image below:</p> <ul> <li>The predicted bounding box (the coordinates delimiting where the model predicts the night table in the painting is located) is outlined in purple.</li> <li>The ground-truth bounding box (the coordinates delimiting where the night table in the painting is actually located) is outlined in green.</li> </ul> <p> <img src="/static/machine-learning/glossary/images/iou_van_gogh_bounding_boxes.jpg" loading="lazy" alt="The Van Gogh painting Vincent's Bedroom in Arles, with two different bounding boxes around the night table beside the bed. The ground-truth bounding box (in green) perfectly circumscribes the night table. The predicted bounding box (in purple) is offset 50% down and to the right of the ground-truth bounding box; it encloses the bottom-right quarter of the night table, but misses the rest of the table." > </p> <p>Here, the intersection of the bounding boxes for prediction and ground truth (below left) is 1, and the union of the bounding boxes for prediction and ground truth (below right) is 7, so the IoU is \(\frac{1}{7}\).</p> <div id="intersection-union-side-by-side"> <img src="/static/machine-learning/glossary/images/iou_van_gogh_intersection.jpg" loading="lazy" alt="Same image as above, but with each bounding box divided into four quadrants. There are seven quadrants total, as the bottom-right quadrant of the ground-truth bounding box and the top-left quadrant of the predicted bounding box overlap each other. This overlapping section (highlighted in green) represents the intersection, and has an area of 1." > <img src="/static/machine-learning/glossary/images/iou_van_gogh_union.jpg" loading="lazy" alt="Same image as above, but with each bounding box divided into four quadrants. There are seven quadrants total, as the bottom-right quadrant of the ground-truth bounding box and the top-left quadrant of the predicted bounding box overlap each other. The entire interior enclosed by both bounding boxes (highlighted in green) represents the union, and has an area of 7." > </div> <p><a class="glossary-anchor" name="iou"></a> <h2 class="hide-from-toc" id="iou" data-text="IoU" tabindex="-1">IoU</h2></p> <p>Abbreviation for <a href="#intersection_over_union"><strong>intersection over union</strong></a>.</p> <p><a class="glossary-anchor" name="item_matrix"></a> <h2 class="hide-from-toc" id="item-matrix" data-text=" item matrix" tabindex="-1"> item matrix</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Recommendation Systems">#recsystems</div> </div></p> <p>In <a href="#recommendation_system"><strong>recommendation systems</strong></a>, a matrix of <a href="#embedding_vector"><strong>embedding vectors</strong></a> generated by <a href="#matrix_factorization"><strong>matrix factorization</strong></a> that holds latent signals about each <a href="#items"><strong>item</strong></a>. Each row of the item matrix holds the value of a single latent feature for all items. For example, consider a movie recommendation system. Each column in the item matrix represents a single movie. 
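<p>For illustration, here is a tiny hypothetical item matrix sketched with NumPy; the two latent rows, the four movie columns, and all of the values are invented:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr"><code translate="no" dir="ltr">import numpy as np

# One row per latent feature, one column per movie, so the shape is
# (number of latent features, number of movies).
item_matrix = np.array([
    # Movie 0  Movie 1  Movie 2  Movie 3
    [0.9,      0.1,     0.8,     0.2],  # latent feature 1
    [0.3,      0.7,     0.2,     0.9],  # latent feature 2
])

# The embedding vector for Movie 2 is the corresponding column.
print(item_matrix[:, 2])  # [0.8 0.2]</code></pre></devsite-code>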
The latent signals might represent genres, or might be harder-to-interpret signals that involve complex interactions among genre, stars, movie age, or other factors.</p> <p>The item matrix has the same number of columns as the target matrix that is being factorized. For example, given a movie recommendation system that evaluates 10,000 movie titles, the item matrix will have 10,000 columns.</p> <p><a class="glossary-anchor" name="items"></a> <h2 class="hide-from-toc" id="items" data-text=" items" tabindex="-1"> items</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Recommendation Systems">#recsystems</div> </div></p> <p>In a <a href="#recommendation_system"><strong>recommendation system</strong></a>, the entities that a system recommends. For example, videos are the items that a video store recommends, while books are the items that a bookstore recommends.</p> <p><a class="glossary-anchor" name="iteration"></a> <h2 class="hide-from-toc" id="iteration" data-text=" iteration" tabindex="-1"> iteration</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A single update of a <a href="#model"><strong>model&#39;s</strong></a> parameters—the model&#39;s <a href="#weight"><strong>weights</strong></a> and <a href="#bias"><strong>biases</strong></a>—during <a href="#training"><strong>training</strong></a>. The <a href="#batch_size"><strong>batch size</strong></a> determines how many examples the model processes in a single iteration. For instance, if the batch size is 20, then the model processes 20 examples before adjusting the parameters.</p> <p>When training a <a href="#neural_network"><strong>neural network</strong></a>, a single iteration involves the following two passes:</p> <ol> <li>A forward pass to evaluate loss on a single batch.</li> <li>A backward pass (<a href="#backpropagation"><strong>backpropagation</strong></a>) to adjust the model&#39;s parameters based on the loss and the learning rate.</li> </ol> <p><a class="glossary-anchor" name="j"></a> <h2 class="glossary" id="j" data-text="J" tabindex="-1">J</h2></p> <p><a class="glossary-anchor" name="JAX"></a> <h2 class="hide-from-toc" id="jax" data-text=" JAX" tabindex="-1"> JAX</h2></p> <p>An array computing library, bringing together <a href="#XLA"><strong>XLA (Accelerated Linear Algebra)</strong></a> and automatic differentiation for high-performance numerical computing. JAX provides a simple and powerful API for writing accelerated numerical code with composable transformations. JAX provides features such as:</p> <ul> <li><code translate="no" dir="ltr">grad</code> (automatic differentiation)</li> <li><code translate="no" dir="ltr">jit</code> (just-in-time compilation)</li> <li><code translate="no" dir="ltr">vmap</code> (automatic vectorization or batching)</li> <li><code translate="no" dir="ltr">pmap</code> (parallelization)</li> </ul> <p>JAX is a language for expressing and composing transformations of numerical code, analogous—but much larger in scope—to Python&#39;s <a href="#numpy"><strong>NumPy</strong></a> library. 
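<p>For example, here is a minimal sketch that composes two of those transformations, <code translate="no" dir="ltr">grad</code> and <code translate="no" dir="ltr">jit</code> (the function being differentiated is made up for illustration):</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr"><code translate="no" dir="ltr">import jax
import jax.numpy as jnp

def squared_norm(w):
    # A simple scalar-valued function of a vector.
    return jnp.sum(w ** 2)

# Compose transformations: differentiate, then JIT-compile the gradient.
grad_fn = jax.jit(jax.grad(squared_norm))

print(grad_fn(jnp.array([1.0, 2.0, 3.0])))  # [2. 4. 6.]</code></pre></devsite-code>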
(In fact, the jax.numpy library under JAX is a functionally equivalent, but entirely rewritten version of the Python NumPy library.)</p> <p>JAX is particularly well-suited for speeding up many machine learning tasks by transforming the models and data into a form suitable for parallelism across GPU and <a href="#TPU"><strong>TPU</strong></a> <a href="#accelerator-chip"><strong>accelerator chips</strong></a>.</p> <p><a href="#flax"><strong>Flax</strong></a>, <a href="#optax"><strong>Optax</strong></a>, <a href="#pax"><strong>Pax</strong></a>, and many other libraries are built on the JAX infrastructure.</p> <p><a class="glossary-anchor" name="k"></a> <h2 class="glossary" id="k" data-text="K" tabindex="-1">K</h2></p> <p><a class="glossary-anchor" name="Keras"></a> <h2 class="hide-from-toc" id="keras" data-text=" Keras" tabindex="-1"> Keras</h2></p> <p>A popular Python machine learning API. <a href="https://keras.io" target="T">Keras</a> runs on several deep learning frameworks, including TensorFlow, where it is made available as <a href="https://www.tensorflow.org/api_docs/python/tf/keras" target="T">tf.keras</a>.</p> <p><a class="glossary-anchor" name="KSVMs"></a> <h2 class="hide-from-toc" id="kernel-support-vector-machines-ksvms" data-text="Kernel Support Vector Machines (KSVMs)" tabindex="-1">Kernel Support Vector Machines (KSVMs)</h2></p> <p>A classification algorithm that seeks to maximize the margin between <a href="#positive_class"><strong>positive</strong></a> and <a href="#negative_class"><strong>negative classes</strong></a> by mapping input data vectors to a higher dimensional space. For example, consider a classification problem in which the input dataset has a hundred features. To maximize the margin between positive and negative classes, a KSVM could internally map those features into a million-dimension space. KSVMs use a loss function called <a href="#hinge-loss"><strong>hinge loss</strong></a>.</p> <p><a class="glossary-anchor" name="keypoints"></a> <h2 class="hide-from-toc" id="keypoints" data-text=" keypoints" tabindex="-1"> keypoints</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>The coordinates of particular features in an image. For example, for an <a href="#image_recognition"><strong>image recognition</strong></a> model that distinguishes flower species, keypoints might be the center of each petal, the stem, the stamen, and so on.</p> <p><a class="glossary-anchor" name="k-fold-cross-validation"></a> <a class="glossary-anchor" name="k-fold"></a> <h2 class="hide-from-toc" id="k-fold-cross-validation" data-text=" k-fold cross validation" tabindex="-1"> k-fold cross validation</h2></p> <p>An algorithm for predicting a model&#39;s ability to <a href="#generalization"><strong>generalize</strong></a> to new data. The <em>k</em> in k-fold refers to the number of equal groups you divide a dataset&#39;s examples into; that is, you train and test your model k times. For each round of training and testing, a different group is the test set, and all remaining groups become the training set. After k rounds of training and testing, you calculate the mean and standard deviation of the chosen test metric(s).</p> <p>For example, suppose your dataset consists of 120 examples. Further suppose you decide to set k to 4.
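<p>A library helper can generate the k splits for you; the following minimal sketch reproduces this 120-example, k=4 scenario, assuming scikit-learn is available:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr"><code translate="no" dir="ltr">import numpy as np
from sklearn.model_selection import KFold

examples = np.arange(120)  # stand-in for 120 examples
kfold = KFold(n_splits=4, shuffle=True, random_state=0)

for round_number, (train_idx, test_idx) in enumerate(kfold.split(examples), 1):
    # Each round trains on 90 examples and tests on the remaining 30.
    print(round_number, len(train_idx), len(test_idx))  # for example, "1 90 30"</code></pre></devsite-code>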
Therefore, after shuffling the examples, you divide the dataset into four equal groups of 30 examples and conduct four training and testing rounds:</p> <p> <img src="/static/machine-learning/glossary/images/k-folds.png" loading="lazy" width="500" alt="A dataset broken into four equal groups of examples. In Round 1, the first three groups are used for training and the last group is used for testing. In Round 2, the first two groups and the last group are used for training, while the third group is used for testing. In Round 3, the first group and the last two groups are used for training, while the second group is used for testing. In Round 4, the first group is used for testing, while the final three groups are used for training." > </p> <p>For example, <a href="#MSE"><strong>Mean Squared Error (MSE)</strong></a> might be the most meaningful metric for a linear regression model. Therefore, you would find the mean and standard deviation of the MSE across all four rounds.</p> <p><a class="glossary-anchor" name="k-means"></a> <h2 class="hide-from-toc" id="k-means" data-text=" k-means" tabindex="-1"> k-means</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Clustering">#clustering</div> </div></p> <p>A popular <a href="#clustering"><strong>clustering</strong></a> algorithm that groups examples in unsupervised learning. The k-means algorithm basically does the following:</p> <ul> <li>Iteratively determines the best k center points (known as <a href="#centroid"><strong>centroids</strong></a>).</li> <li>Assigns each example to the closest centroid. Those examples nearest the same centroid belong to the same group.</li> </ul> <p>The k-means algorithm picks centroid locations to minimize the cumulative <em>square</em> of the distances from each example to its closest centroid.</p> <p>For example, consider the following plot of dog height to dog width:</p> <p> <img src="/static/machine-learning/glossary/images/DogDimensions.svg" loading="lazy" width="450" alt="A Cartesian plot with several dozen data points." > </p> <p>If k=3, the k-means algorithm will determine three centroids. Each example is assigned to its closest centroid, yielding three groups:</p> <p> <img src="/static/machine-learning/glossary/images/DogDimensionsKMeans.svg" loading="lazy" width="450" alt="The same Cartesian plot as in the previous illustration, except with three centroids added. The previous data points are clustered into three distinct groups, with each group representing the data points closest to a particular centroid." > </p> <p>Imagine that a manufacturer wants to determine the ideal sizes for small, medium, and large sweaters for dogs. The three centroids identify the mean height and mean width of the dogs in each cluster. So, the manufacturer should probably base sweater sizes on those three centroids. Note that the centroid of a cluster is typically <em>not</em> an example in the cluster.</p> <p>The preceding illustrations show k-means for examples with only two features (height and width). Note that k-means can group examples across many features.</p> <p><a class="glossary-anchor" name="k-median"></a> <h2 class="hide-from-toc" id="k-median" data-text=" k-median" tabindex="-1"> k-median</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Clustering">#clustering</div> </div></p> <p>A clustering algorithm closely related to <a href="#k-means"><strong>k-means</strong></a>.
The practical difference between the two is as follows:</p> <ul> <li>In k-means, centroids are determined by minimizing the sum of the <em>squares</em> of the distance between a centroid candidate and each of its examples.</li> <li>In k-median, centroids are determined by minimizing the sum of the distance between a centroid candidate and each of its examples.</li> </ul> <p>Note that the definitions of distance are also different:</p> <ul> <li>k-means relies on the <a href="https://wikipedia.org/wiki/Euclidean_distance" target="T">Euclidean distance</a> from the centroid to an example. (In two dimensions, the Euclidean distance means using the Pythagorean theorem to calculate the hypotenuse.) For example, the k-means distance between (2,2) and (5,-2) would be:</li> </ul> <div> $$ {\text{Euclidean distance}} = {\sqrt {(2-5)^2 + (2-(-2))^2}} = 5 $$ </div> <ul> <li>k-median relies on the <a href="https://wikipedia.org/wiki/Taxicab_geometry" target="T"> Manhattan distance</a> from the centroid to an example. This distance is the sum of the absolute deltas in each dimension. For example, the k-median distance between (2,2) and (5,-2) would be:</li> </ul> <div> $$ {\text{Manhattan distance}} = \lvert 2-5 \rvert + \lvert 2-(-2) \rvert = 7 $$ </div> <p><a class="glossary-anchor" name="l"></a> <h2 class="glossary" id="l" data-text="L" tabindex="-1">L</h2></p> <p><a class="glossary-anchor" name="L0_regularization"></a> <h2 class="hide-from-toc" id="l0-regularization" data-text=" L0 regularization" tabindex="-1"> L<sub>0</sub> regularization</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A type of <a href="#regularization"><strong>regularization</strong></a> that penalizes the <em>total number</em> of nonzero <a href="#weight"><strong>weights</strong></a> in a model. For example, a model having 11 nonzero weights would be penalized more than a similar model having 10 nonzero weights.</p> <p>L<sub>0</sub> regularization is sometimes called <em>L0-norm regularization</em>.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-for-additional-notes._8" data-text=" Click the icon for additional notes. " tabindex="-1"> Click the icon for additional notes. </h4> <div class="expand-background"> <p> L<sub>0</sub> regularization is generally impractical in large models because it turns training into an optimization problem that is <em>not</em> <a href="#convex_function"><b>convex</b></a>. </p> </div> <hr /> </section> <p><a class="glossary-anchor" name="L1_loss"></a> <h2 class="hide-from-toc" id="l1-loss" data-text=" L1 loss" tabindex="-1"> L<sub>1</sub> loss</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A <a href="#loss-function"><strong>loss function</strong></a> that calculates the absolute value of the difference between actual <a href="#label"><strong>label</strong></a> values and the values that a <a href="#model"><strong>model</strong></a> predicts.
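<p>In code, L<sub>1</sub> loss is just a sum of absolute differences; here is a minimal sketch in plain Python using the same batch of five examples shown in the following table:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr"><code translate="no" dir="ltr"># Actual label values and model predictions for a batch of five examples.
actual = [7, 5, 8, 4, 9]
predicted = [6, 4, 11, 6, 8]

l1_loss = sum(abs(y - y_hat) for y, y_hat in zip(actual, predicted))
print(l1_loss)  # 8</code></pre></devsite-code>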
For example, here&#39;s the calculation of L<sub>1</sub> loss for a <a href="#batch"><strong>batch</strong></a> of five <a href="#example"><strong>examples</strong></a>:</p> <table> <tr><th>Actual value of example</th> <th>Model's predicted value</th> <th>Absolute value of delta</th></tr> <tr><td>7</td> <td>6</td> <td>1</td> </tr> <tr><td>5</td> <td>4</td> <td>1</td> </tr> <tr><td>8</td> <td>11</td> <td>3</td> </tr> <tr><td>4</td> <td>6</td> <td>2</td> </tr> <tr><td>9</td> <td>8</td> <td>1</td> </tr> <tr><th colspan="2">&nbsp;</th> <th>8 = L<sub>1</sub> loss</th> </tr> </table> <p>L<sub>1</sub> loss is less sensitive to <a href="#outliers"><strong>outliers</strong></a> than <a href="#squared_loss"><strong>L<sub>2</sub> loss</strong></a>.</p> <p>The <a href="#MAE"><strong>Mean Absolute Error</strong></a> is the average L<sub>1</sub> loss per example.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-to-see-the-formal-math." data-text=" Click the icon to see the formal math. " tabindex="-1"> Click the icon to see the formal math. </h4> <div class="expand-background"> <p> $$ L_1 loss = \sum_{i=0}^n | y_i - \hat{y}_i |$$ </p> where: <ul> <li>$n$ is the number of examples.</li> <li>$y$ is the actual value of the label.</li> <li>$\hat{y}$ is the value that the model predicts for $y$.</li> </ul> </div> <hr /> </section> <p><a class="glossary-anchor" name="L1_regularization"></a> <h2 class="hide-from-toc" id="l1-regularization" data-text=" L1 regularization" tabindex="-1"> L<sub>1</sub> regularization</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A type of <a href="#regularization"><strong>regularization</strong></a> that penalizes <a href="#weight"><strong>weights</strong></a> in proportion to the sum of the absolute value of the weights. L<sub>1</sub> regularization helps drive the weights of irrelevant or barely relevant features to <em>exactly 0</em>. A <a href="#feature"><strong>feature</strong></a> with a weight of 0 is effectively removed from the model.</p> <p>Contrast with <a href="#L2_regularization"><strong>L<sub>2</sub> regularization</strong></a>.</p> <p><a class="glossary-anchor" name="L2_loss"></a> <h2 class="hide-from-toc" id="l2-loss" data-text=" L2 loss" tabindex="-1"> L<sub>2</sub> loss</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A <a href="#loss-function"><strong>loss function</strong></a> that calculates the square of the difference between actual <a href="#label"><strong>label</strong></a> values and the values that a <a href="#model"><strong>model</strong></a> predicts. For example, here&#39;s the calculation of L<sub>2</sub> loss for a <a href="#batch"><strong>batch</strong></a> of five <a href="#example"><strong>examples</strong></a>:</p> <table> <tr><th>Actual value of example</th> <th>Model's predicted value</th> <th>Square of delta</th></tr> <tr><td>7</td> <td>6</td> <td>1</td> </tr> <tr><td>5</td> <td>4</td> <td>1</td> </tr> <tr><td>8</td> <td>11</td> <td>9</td> </tr> <tr><td>4</td> <td>6</td> <td>4</td> </tr> <tr><td>9</td> <td>8</td> <td>1</td> </tr> <tr><th colspan="2">&nbsp;</th> <th>16 = L<sub>2</sub> loss</th> </tr> </table> <p>Due to squaring, L<sub>2</sub> loss amplifies the influence of <a href="#outliers"><strong>outliers</strong></a>. That is, L<sub>2</sub> loss reacts more strongly to bad predictions than <a href="#L1_loss"><strong>L<sub>1</sub> loss</strong></a>. 
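<p>The following minimal sketch computes both losses for the preceding batch of five examples, making the comparison concrete:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr"><code translate="no" dir="ltr"># Same batch of five examples as in the preceding table.
actual = [7, 5, 8, 4, 9]
predicted = [6, 4, 11, 6, 8]

l1_loss = sum(abs(y - y_hat) for y, y_hat in zip(actual, predicted))
l2_loss = sum((y - y_hat) ** 2 for y, y_hat in zip(actual, predicted))

print(l1_loss)  # 8
print(l2_loss)  # 16</code></pre></devsite-code>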
For example, the L<sub>1</sub> loss for the preceding batch would be 8 rather than 16. Notice that a single outlier accounts for 9 of the 16.</p> <p><a href="#regression_model"><strong>Regression models</strong></a> typically use L<sub>2</sub> loss as the loss function.</p> <p>The <a href="#MSE"><strong>Mean Squared Error</strong></a> is the average L<sub>2</sub> loss per example. <strong>Squared loss</strong> is another name for L<sub>2</sub> loss.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-to-see-the-formal-math._1" data-text=" Click the icon to see the formal math. " tabindex="-1"> Click the icon to see the formal math. </h4> <div class="expand-background"> <p> $$ L_2 loss = \sum_{i=0}^n {(y_i - \hat{y}_i)}^2$$ </p> where: <ul> <li>$n$ is the number of examples.</li> <li>$y$ is the actual value of the label.</li> <li>$\hat{y}$ is the value that the model predicts for $y$.</li> </ul> </div> <hr /> </section> <p><a class="glossary-anchor" name="L2_regularization"></a> <h2 class="hide-from-toc" id="l2-regularization" data-text=" L2 regularization" tabindex="-1"> L<sub>2</sub> regularization</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A type of <a href="#regularization"><strong>regularization</strong></a> that penalizes <a href="#weight"><strong>weights</strong></a> in proportion to the sum of the <em>squares</em> of the weights. L<sub>2</sub> regularization helps drive <a href="#outliers"><strong>outlier</strong></a> weights (those with high positive or low negative values) closer to 0 but <em>not quite to 0</em>. Features with values very close to 0 remain in the model but don&#39;t influence the model&#39;s prediction very much.</p> <p>L<sub>2</sub> regularization always improves generalization in <a href="#linear_model"><strong>linear models</strong></a>.</p> <p>Contrast with <a href="#L1_regularization"><strong>L<sub>1</sub> regularization</strong></a>.</p> <p><a class="glossary-anchor" name="label"></a> <h2 class="hide-from-toc" id="label" data-text=" label" tabindex="-1"> label</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>In <a href="#supervised_machine_learning"><strong>supervised machine learning</strong></a>, the &quot;answer&quot; or &quot;result&quot; portion of an <a href="#example"><strong>example</strong></a>.</p> <p>Each <a href="#labeled_example"><strong>labeled example</strong></a> consists of one or more <a href="#feature"><strong>features</strong></a> and a label. For example, in a spam detection dataset, the label would probably be either &quot;spam&quot; or &quot;not spam.&quot; In a rainfall dataset, the label might be the amount of rain that fell during a certain period.</p> <p><a class="glossary-anchor" name="labeled_example"></a> <h2 class="hide-from-toc" id="labeled-example" data-text=" labeled example" tabindex="-1"> labeled example</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>An example that contains one or more <a href="#feature"><strong>features</strong></a> and a <a href="#label"><strong>label</strong></a>. 
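<p>In code, a single labeled example is often represented as a mapping from feature names to values plus the label; for instance, here is the first row of the following table sketched as a Python dictionary (the key names are illustrative):</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr"><code translate="no" dir="ltr"># One labeled example: three features plus the label.
labeled_example = {
    "bedrooms": 3,
    "bathrooms": 2,
    "house_age": 15,
    "house_price": 345000,  # the label
}</code></pre></devsite-code>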
For example, the following table shows three labeled examples from a house valuation model, each with three features and one label:</p> <table> <tr><th>Number of bedrooms</th> <th>Number of bathrooms</th> <th>House age</th> <th>House price (label)</th></tr> <tr><td>3</td> <td>2</td> <td>15</td> <td>$345,000</td></tr> <tr><td>2</td> <td>1</td> <td>72</td> <td>$179,000</td></tr> <tr><td>4</td> <td>2</td> <td>34</td> <td>$392,000</td></tr> </table> <p>In <a href="#supervised_machine_learning"><strong>supervised machine learning</strong></a>, models train on labeled examples and make predictions on <a href="#unlabeled_example"><strong>unlabeled examples</strong></a>.</p> <p>Contrast labeled example with unlabeled examples.</p> <p><a class="glossary-anchor" name="label-leakage"></a> <h2 class="hide-from-toc" id="label-leakage" data-text=" label leakage" tabindex="-1"> label leakage</h2></p> <p>A model design flaw in which a <a href="#feature"><strong>feature</strong></a> is a proxy for the <a href="#label"><strong>label</strong></a>. For example, consider a <a href="#binary-classification"><strong>binary classification</strong></a> model that predicts whether or not a prospective customer will purchase a particular product. Suppose that one of the features for the model is a Boolean named <code translate="no" dir="ltr">SpokeToCustomerAgent</code>. Further suppose that a customer agent is only assigned <em>after</em> the prospective customer has actually purchased the product. During training, the model will quickly learn the association between <code translate="no" dir="ltr">SpokeToCustomerAgent</code> and the label.</p> <p><a class="glossary-anchor" name="lambda"></a> <h2 class="hide-from-toc" id="lambda" data-text=" lambda" tabindex="-1"> lambda</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Synonym for <a href="#regularization_rate"><strong>regularization rate</strong></a>.</p> <p>Lambda is an overloaded term. 
Here we&#39;re focusing on the term&#39;s definition within <a href="#regularization"><strong>regularization</strong></a>.</p> <p><a class="glossary-anchor" name="LaMDA"></a> <h2 class="hide-from-toc" id="lamda-language-model-for-dialogue-applications" data-text=" LaMDA (Language Model for Dialogue Applications)" tabindex="-1"> LaMDA (Language Model for Dialogue Applications)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A <a href="#Transformer"><strong>Transformer</strong></a>-based <a href="#large-language-model"><strong>large language model</strong></a> developed by Google trained on a large dialogue dataset that can generate realistic conversational responses.</p> <p><a href="https://blog.google/technology/ai/lamda/">LaMDA: our breakthrough conversation technology</a> provides an overview.</p> <p><a class="glossary-anchor" name="landmarks"></a> <h2 class="hide-from-toc" id="landmarks" data-text=" landmarks" tabindex="-1"> landmarks</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>Synonym for <a href="#keypoints"><strong>keypoints</strong></a>.</p> <p><a class="glossary-anchor" name="language-model"></a> <h2 class="hide-from-toc" id="language-model" data-text=" language model" tabindex="-1"> language model</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A <a href="#model"><strong>model</strong></a> that estimates the probability of a <a href="#token"><strong>token</strong></a> or sequence of tokens occurring in a longer sequence of tokens.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-for-additional-notes._9" data-text=" Click the icon for additional notes. " tabindex="-1"> Click the icon for additional notes. </h4> <div class="expand-background"> <p> Though counterintuitive, many models that evaluate text are not <b>language models</b>. For example, text classification models and sentiment analysis models are not <b>language models</b>. </p> </div> <hr /> </section> <p><a class="glossary-anchor" name="large-language-model"></a> <h2 class="hide-from-toc" id="large-language-model" data-text=" large language model" tabindex="-1"> large language model</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>At a minimum, a <a href="#language-model"><strong>language model</strong></a> having a very high number of <a href="#parameter"><strong>parameters</strong></a>. More informally, any <a href="#Transformer"><b>Transformer</b></a>-based language model, such as <a href="#Gemini"><b>Gemini</b></a> or <a href="#GPT"><b>GPT</b></a>.</p> <p><a class="glossary-anchor" name="latent_space"></a> <h2 class="hide-from-toc" id="latent-space" data-text=" latent space" tabindex="-1"> latent space</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>Synonym for <a href="#embedding_space"><strong>embedding space</strong></a>.</p> <p><a class="glossary-anchor" name="layer"></a> <h2 class="hide-from-toc" id="layer" data-text=" layer" tabindex="-1"> layer</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A set of <a href="#neuron"><strong>neurons</strong></a> in a <a href="#neural_network"><strong>neural network</strong></a>. 
Three common types of layers are as follows:</p> <ul> <li>The <a href="#input-layer"><strong>input layer</strong></a>, which provides values for all the <a href="#feature"><strong>features</strong></a>.</li> <li>One or more <a href="#hidden_layer"><strong>hidden layers</strong></a>, which find nonlinear relationships between the features and the label.</li> <li>The <a href="#output_layer"><strong>output layer</strong></a>, which provides the prediction.</li> </ul> <p>For example, the following illustration shows a neural network with one input layer, two hidden layers, and one output layer:</p> <p> <img src="/static/machine-learning/glossary/images/Layers.png" loading="lazy" width="750" alt="A neural network with one input layer, two hidden layers, and one output layer. The input layer consists of two features. The first hidden layer consists of three neurons and the second hidden layer consists of two neurons. The output layer consists of a single node." > </p> <p>In <a href="#TensorFlow"><strong>TensorFlow</strong></a>, <strong>layers</strong> are also Python functions that take <a href="#tensor"><strong>Tensors</strong></a> and configuration options as input and produce other tensors as output.</p> <p><a class="glossary-anchor" name="layers_API"></a> <h2 class="hide-from-toc" id="layers-api-tf.layers" data-text=" Layers API (tf.layers)" tabindex="-1"> Layers API (tf.layers)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>A TensorFlow API for constructing a <a href="#deep_model"><strong>deep</strong></a> neural network as a composition of layers. The Layers API lets you build different types of <a href="#layer"><strong>layers</strong></a>, such as:</p> <ul> <li><code translate="no" dir="ltr">tf.layers.Dense</code> for a <a href="#fully_connected_layer"><strong>fully-connected layer</strong></a>.</li> <li><code translate="no" dir="ltr">tf.layers.Conv2D</code> for a convolutional layer.</li> </ul> <p>The Layers API follows the <a href="#Keras"><strong>Keras</strong></a> layers API conventions. That is, aside from a different prefix, all functions in the Layers API have the same names and signatures as their counterparts in the Keras layers API.</p> <p><a class="glossary-anchor" name="leaf"></a> <h2 class="hide-from-toc" id="leaf" data-text=" leaf " tabindex="-1"> leaf </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>Any endpoint in a <a href="#decision-tree"><strong>decision tree</strong></a>. Unlike a <a href="#condition"><strong>condition</strong></a>, a leaf doesn&#39;t perform a test. Rather, a leaf is a possible prediction. A leaf is also the terminal <a href="#node"><strong>node</strong></a> of an <a href="#inference-path"><strong>inference path</strong></a>.</p> <p>For example, the following decision tree contains three leaves:</p> <p> <img src="/static/machine-learning/glossary/images/Leaf.png" width="500" loading="lazy" alt="A decision tree with two conditions leading to three leaves." 
> </p> <p><a class="glossary-anchor" name="Learning-Interpretability-Tool"></a> <h2 class="hide-from-toc" id="learning-interpretability-tool-lit" data-text=" Learning Interpretability Tool (LIT)" tabindex="-1"> Learning Interpretability Tool (LIT)</h2></p> <p>A visual, interactive model-understanding and data visualization tool.</p> <p>You can use open-source <a href="https://pair-code.github.io/lit/">LIT</a> to <a href="#interpretability"><strong>interpret</strong></a> models or to visualize text, image, and tabular data.</p> <p><a class="glossary-anchor" name="learning_rate"></a> <h2 class="hide-from-toc" id="learning-rate" data-text=" learning rate" tabindex="-1"> learning rate</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A floating-point number that tells the <a href="#gradient_descent"><strong>gradient descent</strong></a> algorithm how strongly to adjust weights and biases on each <a href="#iteration"><strong>iteration</strong></a>. For example, a learning rate of 0.3 would adjust weights and biases three times more powerfully than a learning rate of 0.1.</p> <p>Learning rate is a key <a href="#hyperparameter"><strong>hyperparameter</strong></a>. If you set the learning rate too low, training will take too long. If you set the learning rate too high, gradient descent often has trouble reaching <a href="#convergence"><strong>convergence</strong></a>.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-for-a-more-mathematical-explanation." data-text=" Click the icon for a more mathematical explanation. " tabindex="-1"> Click the icon for a more mathematical explanation. </h4> <div class="expand-background"> <p> During each iteration, the <a href="#gradient_descent"><b>gradient descent</b></a> algorithm multiplies the learning rate by the gradient. The resulting product is called the <b>gradient step</b>. </p> </div> <hr /> </section> <p><a class="glossary-anchor" name="least_squares_regression"></a> <h2 class="hide-from-toc" id="least-squares-regression" data-text=" least squares regression" tabindex="-1"> least squares regression</h2></p> <p>A linear regression model trained by minimizing <a href="#L2_loss"><strong>L<sub>2</sub> Loss</strong></a>.</p> <p><a class="glossary-anchor" name="linear"></a> <h2 class="hide-from-toc" id="linear" data-text=" linear " tabindex="-1"> linear </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A relationship between two or more variables that can be represented solely through addition and multiplication.</p> <p>The plot of a linear relationship is a line.</p> <p>Contrast with <a href="#nonlinear"><strong>nonlinear</strong></a>.</p> <p><a class="glossary-anchor" name="linear_model"></a> <h2 class="hide-from-toc" id="linear-model" data-text=" linear model" tabindex="-1"> linear model</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A <a href="#model"><strong>model</strong></a> that assigns one <a href="#weight"><strong>weight</strong></a> per <a href="#feature"><strong>feature</strong></a> to make <a href="#prediction"><strong>predictions</strong></a>. (Linear models also incorporate a <a href="#bias"><strong>bias</strong></a>.) 
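<p>For example, the following minimal sketch computes a linear model&#39;s raw prediction from a bias and per-feature weights (the same values used in the worked example below):</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr"><code translate="no" dir="ltr">bias = 7
weights = [-2.5, -1.2, 1.4]
features = [4, -10, 5]

# y' = b + w1*x1 + w2*x2 + w3*x3
prediction = bias + sum(w * x for w, x in zip(weights, features))
print(prediction)  # 16.0</code></pre></devsite-code>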
In contrast, the relationship of features to predictions in <a href="#deep_model"><strong>deep models</strong></a> is generally <strong>nonlinear</strong>.</p> <p>Linear models are usually easier to train and more <a href="#interpretability"><strong>interpretable</strong></a> than deep models. However, deep models can learn complex relationships <em>between</em> features.</p> <p><a href="#linear_regression"><strong>Linear regression</strong></a> and <a href="#logistic_regression"><strong>logistic regression</strong></a> are two types of linear models.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-to-see-the-math." data-text=" Click the icon to see the math. " tabindex="-1"> Click the icon to see the math. </h4> <div class="expand-background"> <p>A linear model follows this formula:</p> <div> $$y' = b + w_1x_1 + w_2x_2 + … w_nx_n$$ </div> where: <ul> <li>y' is the raw prediction. (In certain kinds of linear models, this raw prediction will be further modified. For example, see <a href="#logistic_regression"><b>logistic regression</b></a>.)</li> <li>b is the <a href="#bias"><b>bias</b></a>.</li> <li>w is a <a href="#weight"><b>weight</b></a>, so w<sub>1</sub> is the weight of the first feature, w<sub>2</sub> is the weight of the second feature, and so on.</li> <li>x is a <a href="#feature"><b>feature</b></a>, so x<sub>1</sub> is the value of the first feature, x<sub>2</sub> is the value of the second feature, and so on.</li> </ul> For example, suppose a linear model for three features learns the following bias and weights: <ul> <li>b = 7</li> <li>w<sub>1</sub> = -2.5</li> <li>w<sub>2</sub> = -1.2</li> <li>w<sub>3</sub> = 1.4</li> </ul> Therefore, given three features (x<sub>1</sub>, x<sub>2</sub>, and x<sub>3</sub>), the linear model uses the following equation to generate each prediction: <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded> y' = 7 + (-2.5)(x<sub>1</sub>) + (-1.2)(x<sub>2</sub>) + (1.4)(x<sub>3</sub>) </pre></devsite-code> <p>Suppose a particular example contains the following values:</p> <ul> <li>x<sub>1</sub> = 4</li> <li>x<sub>2</sub> = -10</li> <li>x<sub>3</sub> = 5</li> </ul> Plugging those values into the formula yields a prediction for this example: <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded> y' = 7 + (-2.5)(4) + (-1.2)(-10) + (1.4)(5) y' = 16 </pre></devsite-code> <p>Linear models include not only models that use only a linear equation to make predictions but also a broader set of models that use a linear equation as just one component of the formula that makes predictions. For example, logistic regression post-processes the raw prediction (y') to produce a final prediction value between 0 and 1, exclusively.</p> </div> <hr /> </section> <p><a class="glossary-anchor" name="linear_regression"></a> <h2 class="hide-from-toc" id="linear-regression" data-text=" linear regression" tabindex="-1"> linear regression</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A type of machine learning model in which both of the following are true:</p> <ul> <li>The model is a <a href="#linear_model"><strong>linear model</strong></a>.</li> <li>The prediction is a floating-point value. (This is the <a href="#regression_model"><strong>regression</strong></a> part of <em>linear regression</em>.)</li> </ul> <p>Contrast linear regression with <a href="#logistic_regression"><strong>logistic regression</strong></a>. 
Also, contrast regression with <a href="#classification_model"><strong>classification</strong></a>.</p> <p><a class="glossary-anchor" name="LIT"></a> <h2 class="hide-from-toc" id="lit" data-text=" LIT" tabindex="-1"> LIT</h2></p> <p>Abbreviation for the <a href="#Learning-Interpretability-Tool"><strong>Learning Interpretability Tool (LIT)</strong></a>, which was previously known as the Language Interpretability Tool.</p> <p><a class="glossary-anchor" name="LLM"></a> <h2 class="hide-from-toc" id="llm" data-text=" LLM" tabindex="-1"> LLM</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>Abbreviation for <a href="#large-language-model"><strong>large language model</strong></a>.</p> <p><a class="glossary-anchor" name="LLM-evaluation"></a> <a class="glossary-anchor" name="LLM-evaluations"></a> <h2 class="hide-from-toc" id="llm-evaluations-evals" data-text=" LLM evaluations (evals)" tabindex="-1"> LLM evaluations (evals)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>A set of metrics and benchmarks for assessing the performance of <a href="#large-language-model">large language models</a> (LLMs). At a high level, LLM evaluations:</p> <ul> <li>Help researchers identify areas where LLMs need improvement.</li> <li>Are useful in comparing different LLMs and identifying the best LLM for a particular task.</li> <li>Help ensure that LLMs are safe and ethical to use.</li> </ul> <p><a class="glossary-anchor" name="logistic_regression"></a> <h2 class="hide-from-toc" id="logistic-regression" data-text=" logistic regression" tabindex="-1"> logistic regression</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A type of <a href="#regression_model"><strong>regression model</strong></a> that predicts a probability. Logistic regression models have the following characteristics:</p> <ul> <li>The label is <a href="#categorical_data"><strong>categorical</strong></a>. The term logistic regression usually refers to <strong>binary logistic regression</strong>, that is, to a model that calculates probabilities for labels with two possible values. A less common variant, <strong>multinomial logistic regression</strong>, calculates probabilities for labels with more than two possible values.</li> <li>The loss function during training is <a href="#Log_Loss"><strong>Log Loss</strong></a>. (Multiple Log Loss units can be placed in parallel for labels with more than two possible values.)</li> <li>The model has a linear architecture, not a deep neural network. However, the remainder of this definition also applies to <a href="#deep_model"><strong>deep models</strong></a> that predict probabilities for categorical labels.</li> </ul> <p>For example, consider a logistic regression model that calculates the probability of an input email being either spam or not spam. During inference, suppose the model predicts 0.72. 
Therefore, the model is estimating:</p> <ul> <li>A 72% chance of the email being spam.</li> <li>A 28% chance of the email not being spam.</li> </ul> <p>A logistic regression model uses the following two-step architecture:</p> <ol> <li>The model generates a raw prediction (y&#39;) by applying a linear function of input features.</li> <li>The model uses that raw prediction as input to a <a href="#sigmoid-function"><strong>sigmoid function</strong></a>, which converts the raw prediction to a value between 0 and 1, exclusive.</li> </ol> <p>Like any regression model, a logistic regression model predicts a number. However, this number typically becomes part of a binary classification model as follows:</p> <ul> <li>If the predicted number is <em>greater</em> than the <a href="#classification_threshold"><strong>classification threshold</strong></a>, the binary classification model predicts the positive class.</li> <li>If the predicted number is <em>less</em> than the classification threshold, the binary classification model predicts the negative class.</li> </ul> <p><a class="glossary-anchor" name="logits"></a> <h2 class="hide-from-toc" id="logits" data-text=" logits" tabindex="-1"> logits</h2></p> <p>The vector of raw (non-normalized) predictions that a classification model generates, which is ordinarily then passed to a normalization function. If the model is solving a <a href="#multi-class"><strong>multi-class classification</strong></a> problem, logits typically become an input to the <a href="#softmax"><strong>softmax</strong></a> function. The softmax function then generates a vector of (normalized) probabilities with one value for each possible class.</p> <p><a class="glossary-anchor" name="Log_Loss"></a> <h2 class="hide-from-toc" id="log-loss" data-text=" Log Loss" tabindex="-1"> Log Loss</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The <a href="#loss-function"><strong>loss function</strong></a> used in binary <a href="#logistic_regression"><strong>logistic regression</strong></a>.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-to-see-the-math._1" data-text=" Click the icon to see the math. " tabindex="-1"> Click the icon to see the math. </h4> <div class="expand-background"> <p> The following formula calculates Log Loss: </p> <div> $$\text{Log Loss} = \sum_{(x,y)\in D} -y\log(y') - (1 - y)\log(1 - y')$$ </div> where: <ul> <li> \((x,y)\in D\) is the dataset containing many labeled examples, which are \((x,y)\) pairs. </li> <li> \(y\) is the label in a labeled example. Since this is logistic regression, every value of \(y\) must either be 0 or 1. </li> <li> \(y'\) is the predicted value (somewhere between 0 and 1, exclusive), given the set of features in \(x\). </li> </ul> </div> <hr /> </section> <p><a class="glossary-anchor" name="log-odds"></a> <h2 class="hide-from-toc" id="log-odds" data-text=" log-odds" tabindex="-1"> log-odds</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The logarithm of the odds of some event.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-to-see-the-math._2" data-text=" Click the icon to see the math. " tabindex="-1"> Click the icon to see the math. </h4> <div class="expand-background"> <p> If the event is a binary probability, then <b>odds</b> refers to the ratio of the probability of success (<i>p</i>) to the probability of failure (1-<i>p</i>). 
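<p>A short sketch in plain Python makes the relationship between probability, odds, and log-odds concrete (the probability value chosen here is arbitrary):</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr"><code translate="no" dir="ltr">import math

def log_odds(p):
    # Natural log of the odds: ln(p / (1 - p)).
    return math.log(p / (1 - p))

def sigmoid(x):
    return 1 / (1 + math.exp(-x))

p = 0.75
print(log_odds(p))           # ln(3), approximately 1.0986
print(sigmoid(log_odds(p)))  # 0.75, since the sigmoid inverts the log-odds</code></pre></devsite-code>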
For example, suppose that a given event has a 90% probability of success and a 10% probability of failure. In this case, the odds are calculated as follows: </p> <div> $$ \text{odds} = \frac{p}{1-p} = \frac{0.9}{0.1} = 9 $$ </div> <p>The log-odds is simply the logarithm of the odds. By convention, "logarithm" refers to <a href="https://wikipedia.org/wiki/Natural_logarithm" target="T">natural logarithm</a>, but logarithm could actually be any base greater than 1. Sticking to convention, the log-odds of our example is therefore:</p> <div> $$ \text{log-odds} = \ln(9) \approx 2.2 $$ </div> <p>The log-odds function is the inverse of the <a href="#sigmoid-function"><b>sigmoid function</b></a>. </p> </div> <hr /> </section> <p><a class="glossary-anchor" name="Long_Short-Term_Memory"></a> <h2 class="hide-from-toc" id="long-short-term-memory-lstm" data-text=" Long Short-Term Memory (LSTM)" tabindex="-1"> Long Short-Term Memory (LSTM)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Sequence Models">#seq</div> </div></p> <p>A type of cell in a <a href="#recurrent_neural_network"><strong>recurrent neural network</strong></a> used to process sequences of data in applications such as handwriting recognition, machine translation, and image captioning. LSTMs address the <a href="#vanishing_gradient_problem"><strong>vanishing gradient problem</strong></a> that occurs when training RNNs due to long data sequences by maintaining history in an internal memory state based on new input and context from previous cells in the RNN.</p> <p><a class="glossary-anchor" name="LoRA"></a> <h2 class="hide-from-toc" id="lora" data-text=" LoRA" tabindex="-1"> LoRA</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>Abbreviation for <a href="#Low-Rank-Adaptability"><strong>Low-Rank Adaptability</strong></a>.</p> <p><a class="glossary-anchor" name="loss"></a> <h2 class="hide-from-toc" id="loss" data-text=" loss" tabindex="-1"> loss</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>During the <a href="#training"><strong>training</strong></a> of a <a href="#supervised_machine_learning"><strong>supervised model</strong></a>, a measure of how far a model&#39;s <a href="#prediction"><strong>prediction</strong></a> is from its <a href="#label"><strong>label</strong></a>.</p> <p>A <a href="#loss-function"><strong>loss function</strong></a> calculates the loss.</p> <p><a class="glossary-anchor" name="loss_aggregator"></a> <h2 class="hide-from-toc" id="loss-aggregator" data-text=" loss aggregator" tabindex="-1"> loss aggregator</h2></p> <p>A type of <a href="#machine_learning"><strong>machine learning</strong></a> algorithm that improves the <a href="#performance"><strong>performance</strong></a> of a <a href="#model"><strong>model</strong></a> by combining the <a href="#prediction"><strong>predictions</strong></a> of multiple models and using those predictions to make a single prediction.
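<p>For example, one simple combining strategy is to average the predictions of the individual models; the following minimal sketch assumes three hypothetical models and invented prediction values:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr"><code translate="no" dir="ltr">import numpy as np

# Predictions from three different models for the same four examples.
model_predictions = np.array([
    [0.20, 0.80, 0.55, 0.10],  # model A
    [0.30, 0.70, 0.60, 0.20],  # model B
    [0.25, 0.90, 0.50, 0.15],  # model C
])

# Combine them into a single prediction per example by averaging.
combined = model_predictions.mean(axis=0)
print(combined)  # averages: 0.25, 0.8, 0.55, 0.15</code></pre></devsite-code>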
As a result, a loss aggregator can reduce the variance of the predictions and improve the <a href="#accuracy"><strong>accuracy</strong></a> of the predictions.</p> <p><a class="glossary-anchor" name="loss_curve"></a> <h2 class="hide-from-toc" id="loss-curve" data-text=" loss curve" tabindex="-1"> loss curve</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A plot of <a href="#loss"><strong>loss</strong></a> as a function of the number of training <a href="#iteration"><strong>iterations</strong></a>. The following plot shows a typical loss curve:</p> <p> <img src="/static/machine-learning/glossary/images/LossCurveSmooth.png" width="350" loading="lazy" alt="A Cartesian graph of loss versus training iterations, showing a rapid drop in loss for the initial iterations, followed by a gradual drop, and then a flat slope during the final iterations." > </p> <p>Loss curves can help you determine when your model is <a href="#convergence"><strong>converging</strong></a> or <a href="#overfitting"><strong>overfitting</strong></a>.</p> <p>Loss curves can plot all of the following types of loss:</p> <ul> <li><a href="#training-loss"><strong>training loss</strong></a></li> <li><a href="#validation-loss"><strong>validation loss</strong></a></li> <li><a href="#test-loss"><strong>test loss</strong></a></li> </ul> <p>See also <a href="#generalization_curve"><strong>generalization curve</strong></a>.</p> <p><a class="glossary-anchor" name="loss-function"></a> <h2 class="hide-from-toc" id="loss-function" data-text=" loss function" tabindex="-1"> loss function</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>During <a href="#training"><strong>training</strong></a> or testing, a mathematical function that calculates the loss on a <a href="#batch"><strong>batch</strong></a> of examples. A loss function returns a lower loss for models that make good predictions than for models that make bad predictions.</p> <p>The goal of training is typically to minimize the loss that a loss function returns.</p> <p>Many different kinds of loss functions exist. Pick the appropriate loss function for the kind of model you are building. For example:</p> <ul> <li><a href="#L2_loss"><strong>L<sub>2</sub> loss</strong></a> (or <a href="#MSE"><strong>Mean Squared Error</strong></a>) is the loss function for <a href="#linear_regression"><strong>linear regression</strong></a>.</li> <li><a href="#Log_Loss"><strong>Log Loss</strong></a> is the loss function for <a href="#logistic_regression"><strong>logistic regression</strong></a>.</li> </ul> <p><a class="glossary-anchor" name="loss_surface"></a> <h2 class="hide-from-toc" id="loss-surface" data-text=" loss surface" tabindex="-1"> loss surface</h2></p> <p>A graph of weight(s) versus loss.
<a href="#gradient_descent"><strong>Gradient descent</strong></a> aims to find the weight(s) for which the loss surface is at a local minimum.</p> <p><a class="glossary-anchor" name="Low-Rank-Adaptability"></a> <h2 class="hide-from-toc" id="low-rank-adaptability-lora" data-text=" Low-Rank Adaptability (LoRA)" tabindex="-1"> Low-Rank Adaptability (LoRA)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>A <a href="#parameter-efficient-tuning"><strong>parameter-efficient</strong></a> technique for <a href="#fine-tuning"><strong>fine tuning</strong></a> that &quot;freezes&quot; the model&#39;s pre-trained weights (such that they can no longer be modified) and then inserts a small set of trainable weights into the model. This set of trainable weights (also known as &quot;update matrices&quot;) is considerably smaller than the base model and is therefore much faster to train.</p> <p>LoRA provides the following benefits:</p> <ul> <li>Improves the quality of a model&#39;s predictions for the domain where the fine tuning is applied.</li> <li>Fine-tunes faster than techniques that require fine-tuning <em>all</em> of a model&#39;s parameters.</li> <li>Reduces the computational cost of <a href="#inference"><strong>inference</strong></a> by enabling concurrent serving of multiple specialized models sharing the same base model.</li> </ul> <section class="expandable"> <h4 class="showalways" id="click-the-icon-to-learn-more-about-update-matrices-in-lora." data-text=" Click the icon to learn more about update matrices in LoRA. " tabindex="-1"> Click the icon to learn more about update matrices in LoRA. </h4> <div class="expand-background"> The update matrices used in LoRA consist of <a href="https://wikipedia.org/wiki/Rank_factorization">rank decomposition matrices</a>, which are derived from the base model to help filter out noise and focus training on the most important features of the model. </div> <hr /> </section> <p><a class="glossary-anchor" name="LSTM"></a> <h2 class="hide-from-toc" id="lstm" data-text=" LSTM" tabindex="-1"> LSTM</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Sequence Models">#seq</div> </div></p> <p>Abbreviation for <a href="#Long_Short-Term_Memory"><strong>Long Short-Term Memory</strong></a>.</p> <p><a class="glossary-anchor" name="m"></a> <h2 class="glossary" id="m" data-text="M" tabindex="-1">M</h2></p> <p><a class="glossary-anchor" name="machine_learning"></a> <h2 class="hide-from-toc" id="machine-learning" data-text=" machine learning" tabindex="-1"> machine learning</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A program or system that <a href="#training"><strong>trains</strong></a> a <a href="#model"><strong>model</strong></a> from input data. 
The trained model can make useful predictions from new (never-before-seen) data drawn from the same distribution as the one used to train the model.</p> <p>Machine learning also refers to the field of study concerned with these programs or systems.</p> <p><a class="glossary-anchor" name="majority_class"></a> <h2 class="hide-from-toc" id="majority-class" data-text=" majority class" tabindex="-1"> majority class</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The more common label in a <a href="#class_imbalanced_data_set"><strong>class-imbalanced dataset</strong></a>. For example, given a dataset containing 99% negative labels and 1% positive labels, the negative labels are the majority class.</p> <p>Contrast with <a href="#minority_class"><strong>minority class</strong></a>.</p> <p><a class="glossary-anchor" name="markov_decision_process"></a> <h2 class="hide-from-toc" id="markov-decision-process-mdp" data-text=" Markov decision process (MDP)" tabindex="-1"> Markov decision process (MDP)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>A graph representing the decision-making model where decisions (or <a href="#action"><strong>actions</strong></a>) are taken to navigate a sequence of <a href="#state"><strong>states</strong></a> under the assumption that the <a href="#Markov_property"><strong>Markov property</strong></a> holds. In <a href="#reinforcement_learning"><strong>reinforcement learning</strong></a>, these transitions between states return a numerical <a href="#reward"><strong>reward</strong></a>.</p> <p><a class="glossary-anchor" name="Markov_property"></a> <h2 class="hide-from-toc" id="markov-property" data-text=" Markov property" tabindex="-1"> Markov property</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>A property of certain <a href="#environment"><strong>environments</strong></a>, where state transitions are entirely determined by information implicit in the current <a href="#state"><strong>state</strong></a> and the agent&#39;s <a href="#action"><strong>action</strong></a>.</p> <p><a class="glossary-anchor" name="masked-language-model"></a> <h2 class="hide-from-toc" id="masked-language-model" data-text=" masked language model" tabindex="-1"> masked language model</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A <a href="#language-model"><strong>language model</strong></a> that predicts the probability of candidate tokens to fill in blanks in a sequence. For example, a masked language model can calculate probabilities for candidate word(s) to replace the underline in the following sentence:</p> <blockquote> <p>The ____ in the hat came back.</p> </blockquote> <p>The literature typically uses the string &quot;MASK&quot; instead of an underline. For example:</p> <blockquote> <p>The &quot;MASK&quot; in the hat came back.</p> </blockquote> <p>Most modern masked language models are <a href="#bidirectional"><strong>bidirectional</strong></a>.</p> <p><a class="glossary-anchor" name="matplotlib"></a> <h2 class="hide-from-toc" id="matplotlib" data-text=" matplotlib" tabindex="-1"> matplotlib</h2></p> <p>An open-source Python 2D plotting library. 
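</p> <p>For example, the following is a minimal sketch, assuming matplotlib and some hypothetical loss values, of plotting a loss curve like the one shown in the <a href="#loss_curve"><strong>loss curve</strong></a> entry:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded>import matplotlib.pyplot as plt

# Hypothetical loss recorded at the end of each training iteration.
iterations = list(range(1, 11))
training_loss = [9.0, 5.2, 3.1, 2.0, 1.4, 1.1, 0.9, 0.8, 0.75, 0.73]

plt.plot(iterations, training_loss)
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.title("Loss curve")
plt.show()
</pre></devsite-code> <p>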
<a href="https://matplotlib.org/" target="T">matplotlib</a> helps you visualize different aspects of machine learning.</p> <p><a class="glossary-anchor" name="matrix_factorization"></a> <h2 class="hide-from-toc" id="matrix-factorization" data-text=" matrix factorization" tabindex="-1"> matrix factorization</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Recommendation Systems">#recsystems</div> </div></p> <p>In math, a mechanism for finding the matrixes whose dot product approximates a target matrix.</p> <p>In <a href="#recommendation_system"><strong>recommendation systems</strong></a>, the target matrix often holds users&#39; ratings on <a href="#items"><strong>items</strong></a>. For example, the target matrix for a movie recommendation system might look something like the following, where the positive integers are user ratings and 0 means that the user didn&#39;t rate the movie:</p> <table> <tr> <th>&nbsp;</th> <th>Casablanca</th> <th>The Philadelphia Story</th> <th>Black Panther</th> <th>Wonder Woman</th> <th>Pulp Fiction</th> </tr> <tr> <td>User 1</td> <td>5.0</td> <td>3.0</td> <td>0.0</td> <td>2.0</td> <td>0.0</td> </tr> <tr> <td>User 2</td> <td>4.0</td> <td>0.0</td> <td>0.0</td> <td>1.0</td> <td>5.0</td> </tr> <tr> <td>User 3</td> <td>3.0</td> <td>1.0</td> <td>4.0</td> <td>5.0</td> <td>0.0</td> </tr> </table> <p>The movie recommendation system aims to predict user ratings for unrated movies. For example, will User 1 like <em>Black Panther</em>?</p> <p>One approach for recommendation systems is to use matrix factorization to generate the following two matrixes:</p> <ul> <li>A <a href="#user_matrix"><strong>user matrix</strong></a>, shaped as the number of users X the number of embedding dimensions.</li> <li>An <a href="#item_matrix"><strong>item matrix</strong></a>, shaped as the number of embedding dimensions X the number of items.</li> </ul> <p>For example, using matrix factorization on our three users and five items could yield the following user matrix and item matrix:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Text only">User Matrix Item Matrix 1.1 2.3 0.9 0.2 1.4 2.0 1.2 0.6 2.0 1.7 1.2 1.2 -0.1 2.1 2.5 0.5</pre></devsite-code> <p>The dot product of the user matrix and item matrix yields a recommendation matrix that contains not only the original user ratings but also predictions for the movies that each user hasn&#39;t seen. For example, consider User 1&#39;s rating of <em>Casablanca</em>, which was 5.0. The dot product corresponding to that cell in the recommendation matrix should hopefully be around 5.0, and it is:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded> (1.1 * 0.9) + (2.3 * 1.7) = 4.9 </pre></devsite-code> <p>More importantly, will User 1 like <em>Black Panther</em>? Taking the dot product corresponding to the first row and the third column yields a predicted rating of 4.3:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded> (1.1 * 1.4) + (2.3 * 1.2) = 4.3 </pre></devsite-code> <p>Matrix factorization typically yields a user matrix and item matrix that, together, are significantly more compact than the target matrix.</p> <p><a class="glossary-anchor" name="MAE"></a> <h2 class="hide-from-toc" id="mean-absolute-error-mae" data-text=" Mean Absolute Error (MAE)" tabindex="-1"> Mean Absolute Error (MAE)</h2></p> <p>The average loss per example when <a href="#L1_loss"><strong>L<sub>1</sub> loss</strong></a> is used. 
Calculate Mean Absolute Error as follows:</p> <ol> <li>Calculate the L<sub>1</sub> loss for a batch.</li> <li>Divide the L<sub>1</sub> loss by the number of examples in the batch.</li> </ol> <section class="expandable"> <h4 class="showalways" id="click-the-icon-to-see-the-formal-math._2" data-text=" Click the icon to see the formal math. " tabindex="-1"> Click the icon to see the formal math. </h4> <div class="expand-background"> <p> $$\text{Mean Absolute Error} = \frac{1}{n}\sum_{i=1}^{n} | y_i - \hat{y}_i |$$ </p> <p>where:</p> <ul> <li>$n$ is the number of examples.</li> <li>$y$ is the actual value of the label.</li> <li>$\hat{y}$ is the value that the model predicts for $y$.</li> </ul> </div> <hr /> </section> <p>For example, consider the calculation of L<sub>1</sub> loss on the following batch of five examples:</p> <table> <tr><th>Actual value of example</th> <th>Model's predicted value</th> <th>Loss (difference between actual and predicted)</th></tr> <tr><td>7</td> <td>6</td> <td>1</td> </tr> <tr><td>5</td> <td>4</td> <td>1</td> </tr> <tr><td>8</td> <td>11</td> <td>3</td> </tr> <tr><td>4</td> <td>6</td> <td>2</td> </tr> <tr><td>9</td> <td>8</td> <td>1</td> </tr> <tr><th colspan="2">&nbsp;</th> <th>8 = L<sub>1</sub> loss</th> </tr> </table> <p>So, L<sub>1</sub> loss is 8 and the number of examples is 5. Therefore, the Mean Absolute Error is:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded>Mean Absolute Error = L<sub>1</sub> loss / Number of Examples
Mean Absolute Error = 8/5 = 1.6</pre></devsite-code> <p>Contrast Mean Absolute Error with <a href="#MSE"><strong>Mean Squared Error</strong></a> and <a href="#RMSE"><strong>Root Mean Squared Error</strong></a>.</p> <p><a class="glossary-anchor" name="MSE"></a> <h2 class="hide-from-toc" id="mean-squared-error-mse" data-text=" Mean Squared Error (MSE)" tabindex="-1"> Mean Squared Error (MSE)</h2></p> <p>The average loss per example when <a href="#L2_loss"><strong>L<sub>2</sub> loss</strong></a> is used. Calculate Mean Squared Error as follows:</p> <ol> <li>Calculate the L<sub>2</sub> loss for a batch.</li> <li>Divide the L<sub>2</sub> loss by the number of examples in the batch.</li> </ol> <section class="expandable"> <h4 class="showalways" id="click-the-icon-to-see-the-formal-math._3" data-text=" Click the icon to see the formal math. " tabindex="-1"> Click the icon to see the formal math. 
</h4> <div class="expand-background"> $$\text{Mean Squared Error} = \frac{1}{n}\sum_{i=0}^n {(y_i - \hat{y}_i)}^2$$ where: <ul> <li>$n$ is the number of examples.</li> <li>$y$ is the actual value of the label.</li> <li>$\hat{y}$ is the model's prediction for $y$.</li> </ul> </div> <hr /> </section> <p>For example, consider the loss on the following batch of five examples:</p> <table> <tr><th>Actual value</th> <th>Model's prediction</th> <th>Loss</th> <th>Squared loss</th></tr> <tr><td>7</td> <td>6</td> <td>1</td> <td>1</td></tr> <tr><td>5</td> <td>4</td> <td>1</td> <td>1</td></tr> <tr><td>8</td> <td>11</td> <td>3</td> <td>9</td></tr> <tr><td>4</td> <td>6</td> <td>2</td> <td>4</td></tr> <tr><td>9</td> <td>8</td> <td>1</td> <td>1</td></tr> <tr><th colspan="3"> </th> <th>16 = L<sub>2</sub> loss</th></tr> </table> <p>Therefore, the Mean Squared Error is:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded> Mean Squared Error = L<sub>2</sub> loss / Number of Examples Mean Squared Error = 16/5 = 3.2 </pre></devsite-code> <p>Mean Squared Error is a popular training <a href="#optimizer"><strong>optimizer</strong></a>, particularly for <a href="#linear_regression"><strong>linear regression</strong></a>.</p> <p>Contrast Mean Squared Error with <a href="#MAE"><strong>Mean Absolute Error</strong></a> and <a href="#RMSE"><strong>Root Mean Squared Error</strong></a>.</p> <p><a href="#TensorFlow_Playground"><strong>TensorFlow Playground</strong></a> uses Mean Squared Error to calculate loss values.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-to-see-more-details-about-outliers." data-text=" Click the icon to see more details about outliers. " tabindex="-1"> Click the icon to see more details about outliers. </h4> <div class="expand-background"> <p> <a href="#outliers"><b>Outliers</b></a> strongly influence Mean Squared Error. For example, a loss of 1 is a squared loss of 1, but a loss of 3 is a squared loss of 9. In the preceding table, the example with a loss of 3 accounts for ~56% of the Mean Squared Error, while each of the examples with a loss of 1 accounts for only 6% of the Mean Squared Error. </p> <p>Outliers don't influence Mean Absolute Error as strongly as Mean Squared Error. For example, a loss of 3 accounts for only ~38% of the Mean Absolute Error.</p> <p><a href="#clipping"><b>Clipping</b></a> is one way to prevent extreme outliers from damaging your model's predictive ability.</p> </div> <hr /> </section> <p><a class="glossary-anchor" name="mesh"></a> <h2 class="hide-from-toc" id="mesh" data-text=" mesh" tabindex="-1"> mesh</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> <div class="glossary-icon" title="Google Cloud">#GoogleCloud</div> </div></p> <p>In ML parallel programming, a term associated with assigning the data and model to TPU chips, and defining how these values will be sharded or replicated.</p> <p>Mesh is an overloaded term that can mean either of the following:</p> <ul> <li>A physical layout of TPU chips. </li> <li>An abstract logical construct for mapping the data and model to the TPU chips. 
</li> </ul> <p>In either case, a mesh is specified as a <a href="#shape"><strong>shape</strong></a>.</p> <p><a class="glossary-anchor" name="meta-learning"></a> <h2 class="hide-from-toc" id="meta-learning" data-text=" meta-learning" tabindex="-1"> meta-learning</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A subset of machine learning that discovers or improves a learning algorithm. A meta-learning system can also aim to train a model to quickly learn a new task from a small amount of data or from experience gained in previous tasks. Meta-learning algorithms generally try to achieve the following:</p> <ul> <li>Improve or learn hand-engineered features (such as an initializer or an optimizer).</li> <li>Be more data-efficient and compute-efficient.</li> <li>Improve generalization.</li> </ul> <p>Meta-learning is related to <a href="#few-shot_learning"><strong>few-shot learning</strong></a>.</p> <p><a class="glossary-anchor" name="metric"></a> <h2 class="hide-from-toc" id="metric" data-text=" metric" tabindex="-1"> metric</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>A statistic that you care about.</p> <p>An <a href="#objective"><strong>objective</strong></a> is a metric that a machine learning system tries to optimize.</p> <p><a class="glossary-anchor" name="metrics_API"></a> <h2 class="hide-from-toc" id="metrics-api-tf.metrics" data-text=" Metrics API (tf.metrics)" tabindex="-1"> Metrics API (tf.metrics)</h2></p> <p>A TensorFlow API for evaluating models. For example, <code translate="no" dir="ltr">tf.metrics.accuracy</code> determines how often a model&#39;s predictions match labels.</p> <p><a class="glossary-anchor" name="mini-batch"></a> <h2 class="hide-from-toc" id="mini-batch" data-text=" mini-batch" tabindex="-1"> mini-batch</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A small, randomly selected subset of a <a href="#batch"><strong>batch</strong></a> processed in one <a href="#iteration"><strong>iteration</strong></a>. The <a href="#batch_size"><strong>batch size</strong></a> of a mini-batch is usually between 10 and 1,000 examples.</p> <p>For example, suppose the entire training set (the full batch) consists of 1,000 examples. Further suppose that you set the <a href="#batch_size"><strong>batch size</strong></a> of each mini-batch to 20. Therefore, each iteration determines the loss on a random 20 of the 1,000 examples and then adjusts the <a href="#weight"><strong>weights</strong></a> and <a href="#bias"><strong>biases</strong></a> accordingly.</p> <p>It is much more efficient to calculate the loss on a mini-batch than the loss on all the examples in the full batch.</p> <p><a class="glossary-anchor" name="mini-batch_SGD"></a> <h2 class="hide-from-toc" id="mini-batch-stochastic-gradient-descent" data-text=" mini-batch stochastic gradient descent" tabindex="-1"> mini-batch stochastic gradient descent</h2></p> <p>A <a href="#gradient_descent"><strong>gradient descent</strong></a> algorithm that uses <a href="#mini-batch"><strong>mini-batches</strong></a>. In other words, mini-batch stochastic gradient descent estimates the gradient based on a small subset of the training data. 
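</p> <p>The following is a minimal sketch, assuming NumPy and a simple one-weight linear model, of a training loop that samples a random mini-batch on each iteration:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded>import numpy as np

rng = np.random.default_rng(seed=42)

# Hypothetical training set of 1,000 examples for the model: prediction = w * x.
x = rng.uniform(0.0, 10.0, size=1000)
y = 3.0 * x + rng.normal(0.0, 0.5, size=1000)

w = 0.0
learning_rate = 0.01
batch_size = 20

for iteration in range(200):
  # Randomly select a mini-batch of 20 of the 1,000 examples.
  indices = rng.choice(len(x), size=batch_size, replace=False)
  x_batch, y_batch = x[indices], y[indices]

  # Estimate the gradient of the squared loss on the mini-batch only.
  error = w * x_batch - y_batch
  gradient = 2.0 * np.mean(error * x_batch)

  # Adjust the weight using the mini-batch gradient estimate.
  w -= learning_rate * gradient

print(w)  # Approximately 3.0 after training.
</pre></devsite-code> <p>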
Regular <a href="#SGD"><strong>stochastic gradient descent</strong></a> uses a mini-batch of size 1.</p> <p><a class="glossary-anchor" name="minimax_loss"></a> <h2 class="hide-from-toc" id="minimax-loss" data-text=" minimax loss" tabindex="-1"> minimax loss</h2></p> <p>A loss function for <a href="#generative_adversarial_network"><strong>generative adversarial networks</strong></a>, based on the <a href="#cross-entropy"><strong>cross-entropy</strong></a> between the distribution of generated data and real data.</p> <p>Minimax loss is used in the <a href="https://arxiv.org/pdf/1406.2661.pdf">first paper</a> to describe generative adversarial networks.</p> <p><a class="glossary-anchor" name="minority_class"></a> <h2 class="hide-from-toc" id="minority-class" data-text=" minority class" tabindex="-1"> minority class</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The less common label in a <a href="#class_imbalanced_data_set"><strong>class-imbalanced dataset</strong></a>. For example, given a dataset containing 99% negative labels and 1% positive labels, the positive labels are the minority class.</p> <p>Contrast with <a href="#majority_class"><strong>majority class</strong></a>.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-for-additional-notes._10" data-text=" Click the icon for additional notes. " tabindex="-1"> Click the icon for additional notes. </h4> <div class="expand-background"> <p> A training set with a million <a href="#example">examples</a> sounds impressive. However, if the minority class is poorly represented, then even a very large training set might be insufficient. Focus less on the total number of examples in the dataset and more on the number of examples in the minority class. </p> <p> If your dataset doesn't contain enough minority class examples, consider using <a href="#downsampling"><b>downsampling</b></a> (the definition in the second bullet) to supplement the minority class. </p> </div> <hr /> </section> <p><a class="glossary-anchor" name="mixture-of-experts"></a> <h2 class="hide-from-toc" id="mixture-of-experts" data-text=" mixture of experts" tabindex="-1"> mixture of experts</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>A scheme to increase <a href="#neural-network"><strong>neural network</strong></a> efficiency by using only a subset of its parameters (known as an <strong>expert</strong>) to process a given input <a href="#token"><strong>token</strong></a> or <a href="#example"><strong>example</strong></a>. 
A <strong>gating network</strong> routes each input token or example to the proper expert(s).</p> <p>For details, see either of the following papers:</p> <ul> <li><a href="https://arxiv.org/abs/1701.06538">Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer</a></li> <li><a href="https://research.google/blog/mixture-of-experts-with-expert-choice-routing/?m=1">Mixture-of-Experts with Expert Choice Routing</a></li> </ul> <p><a class="glossary-anchor" name="ML"></a> <h2 class="hide-from-toc" id="ml" data-text=" ML" tabindex="-1"> ML</h2></p> <p>Abbreviation for <a href="#machine_learning"><strong>machine learning</strong></a>.</p> <p><a class="glossary-anchor" name="MMIT"></a> <h2 class="hide-from-toc" id="mmit" data-text=" MMIT" tabindex="-1"> MMIT</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Image Models">#image</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>Abbreviation for <a href="#multimodal-instruction-tuned"><strong>multimodal instruction-tuned</strong></a>.</p> <p><a class="glossary-anchor" name="MNIST"></a> <h2 class="hide-from-toc" id="mnist" data-text=" MNIST" tabindex="-1"> MNIST</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>A public-domain dataset compiled by LeCun, Cortes, and Burges containing 60,000 images, each image showing how a human manually wrote a particular digit from 0–9. Each image is stored as a 28x28 array of integers, where each integer is a grayscale value between 0 and 255, inclusive.</p> <p>MNIST is a canonical dataset for machine learning, often used to test new machine learning approaches. For details, see <a href="http://yann.lecun.com/exdb/mnist/" target="T"> The MNIST Database of Handwritten Digits</a>.</p> <p><a class="glossary-anchor" name="modality"></a> <h2 class="hide-from-toc" id="modality" data-text=" modality" tabindex="-1"> modality</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A high-level data category. For example, numbers, text, images, video, and audio are five different modalities.</p> <p><a class="glossary-anchor" name="model"></a> <h2 class="hide-from-toc" id="model" data-text=" model" tabindex="-1"> model</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>In general, any mathematical construct that processes input data and returns output. Phrased differently, a model is the set of parameters and structure needed for a system to make predictions. In <a href="#supervised_machine_learning"><strong>supervised machine learning</strong></a>, a model takes an <a href="#example"><strong>example</strong></a> as input and infers a <a href="#prediction"><strong>prediction</strong></a> as output. Within supervised machine learning, models differ somewhat. 
For example:</p> <ul> <li>A linear regression model consists of a set of <a href="#weight"><strong>weights</strong></a> and a <a href="#bias"><strong>bias</strong></a>.</li> <li>A <a href="#neural-network"><strong>neural network</strong></a> model consists of: <ul> <li>A set of <a href="#hidden_layer"><strong>hidden layers</strong></a>, each containing one or more <a href="#neuron"><strong>neurons</strong></a>.</li> <li>The weights and bias associated with each neuron.</li> </ul></li> <li>A <a href="#decision-tree"><strong>decision tree</strong></a> model consists of: <ul> <li>The shape of the tree; that is, the pattern in which the conditions and leaves are connected.</li> <li>The conditions and leaves.</li> </ul></li> </ul> <p>You can save, restore, or make copies of a model.</p> <p><a href="#unsupervised_machine_learning"><strong>Unsupervised machine learning</strong></a> also generates models, typically a function that can map an input example to the most appropriate <a href="#clustering"><strong>cluster</strong></a>.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-to-compare-algebraic-and-programming-functions-to-ml-models." data-text=" Click the icon to compare algebraic and programming functions to ML models. " tabindex="-1"> Click the icon to compare algebraic and programming functions to ML models. </h4> <div class="expand-background"> <p>An algebraic function such as the following is a model:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded> f(x, y) = 3x - 5xy + y<sup>2</sup> + 17 </pre></devsite-code> <p>The preceding function maps input values (<tt>x</tt> and <tt>y</tt>) to output.</p> <p>Similarly, a programming function like the following is also a model:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="scdoc">def half_of_greater(x, y):
  if (x > y):
    return(x / 2)
  else:
    return(y / 2)</pre></devsite-code> <p>A caller passes arguments to the preceding Python function, and the Python function generates output (via the <tt>return</tt> statement).</p> <p>Although a <a href="#deep_neural_network"><b>deep neural network</b></a> has a very different mathematical structure than an algebraic or programming function, a deep neural network still takes input (an example) and returns output (a prediction).</p> <p>A human programmer codes a programming function manually. In contrast, a machine learning model gradually learns the optimal parameters during automated training.</p> </div> <hr /> </section> <p><a class="glossary-anchor" name="model_capacity"></a> <h2 class="hide-from-toc" id="model-capacity" data-text=" model capacity" tabindex="-1"> model capacity</h2></p> <p>The complexity of problems that a model can learn. The more complex the problems that a model can learn, the higher the model&#39;s capacity. A model&#39;s capacity typically increases with the number of model parameters. 
For a formal definition of classifier capacity, see <a href="https://wikipedia.org/wiki/VC_dimension" target="T">VC dimension</a>.</p> <p><a class="glossary-anchor" name="model_cascading"></a> <h2 class="hide-from-toc" id="model-cascading" data-text=" model cascading" tabindex="-1"> model cascading</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>A system that picks the ideal <a href="#model"><strong>model</strong></a> for a specific inference query.</p> <p>Imagine a group of models, ranging from very large (lots of <a href="#parameter"><strong>parameters</strong></a>) to much smaller (far fewer parameters). Very large models consume more computational resources at <a href="#inference"><strong>inference</strong></a> time than smaller models. However, very large models can typically infer more complex requests than smaller models. Model cascading determines the complexity of the inference query and then picks the appropriate model to perform the inference. The main motivation for model cascading is to reduce inference costs by generally selecting smaller models, and only selecting a larger model for more complex queries.</p> <p>Imagine that a small model runs on a phone and a larger version of that model runs on a remote server. Good model cascading reduces cost and latency by enabling the smaller model to handle simple requests and only calling the remote model to handle complex requests.</p> <p>See also <a href="#model_router"><strong>model router</strong></a>.</p> <p><a class="glossary-anchor" name="model-parallelism"></a> <h2 class="hide-from-toc" id="model-parallelism" data-text=" model parallelism" tabindex="-1"> model parallelism</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A way of scaling training or inference that puts different parts of one <a href="#model"><strong>model</strong></a> on different <a href="#device"><strong>devices</strong></a>. Model parallelism enables models that are too big to fit on a single device.</p> <p>To implement model parallelism, a system typically does the following:</p> <ol> <li>Shards (divides) the model into smaller parts.</li> <li>Distributes the training of those smaller parts across multiple processors. Each processor trains its own part of the model.</li> <li>Combines the results to create a single model.</li> </ol> <p>Model parallelism slows training.</p> <p>See also <a href="#data-parallelism"><strong>data parallelism</strong></a>.</p> <p><a class="glossary-anchor" name="model_router"></a> <h2 class="hide-from-toc" id="model-router" data-text=" model router" tabindex="-1"> model router</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>The algorithm that determines the ideal <a href="#model"><strong>model</strong></a> for <a href="#inference"><strong>inference</strong></a> in <a href="#model_cascading"><strong>model cascading</strong></a>. A model router is itself typically a machine learning model that gradually learns how to pick the best model for a given input. 
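</p> <p>The following is a minimal sketch, assuming two hypothetical model functions and a simple length-based complexity heuristic, of a non-machine-learning router of the kind mentioned next:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded>def small_model(query):
  # Hypothetical cheap on-device model.
  return "small-model answer to: " + query

def large_model(query):
  # Hypothetical expensive server-side model.
  return "large-model answer to: " + query

def route(query):
  # A heuristic router: treat long queries as complex.
  if len(query.split()) &gt; 12:
    return large_model(query)
  return small_model(query)

print(route("What time is it?"))
print(route("Summarize the key differences between these two twenty-page "
            "contracts and list any clauses that conflict with each other."))
</pre></devsite-code> <p>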
However, a model router could sometimes be a simpler, non-machine learning algorithm.</p> <p><a class="glossary-anchor" name="model_training"></a> <h2 class="hide-from-toc" id="model-training" data-text=" model training" tabindex="-1"> model training</h2></p> <p>The process of determining the best <a href="#model"><strong>model</strong></a>.</p> <p><a class="glossary-anchor" name="Momentum"></a> <h2 class="hide-from-toc" id="momentum" data-text=" Momentum" tabindex="-1"> Momentum</h2></p> <p>A sophisticated gradient descent algorithm in which a learning step depends not only on the derivative in the current step, but also on the derivatives of the step(s) that immediately preceded it. Momentum involves computing an exponentially weighted moving average of the gradients over time, analogous to momentum in physics. Momentum sometimes prevents learning from getting stuck in local minima.</p> <p><a class="glossary-anchor" name="MOE"></a> <h2 class="hide-from-toc" id="moe" data-text=" MOE" tabindex="-1"> MOE</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Image Models">#image</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>Abbreviation for <a href="#mixture-of-experts"><strong>mixture of experts</strong></a>.</p> <p><a class="glossary-anchor" name="multi-class"></a> <h2 class="hide-from-toc" id="multi-class-classification" data-text=" multi-class classification" tabindex="-1"> multi-class classification</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>In supervised learning, a <a href="#classification_model"><strong>classification</strong></a> problem in which the dataset contains <em>more than two</em> <a href="#class"><strong>classes</strong></a> of labels. For example, the labels in the Iris dataset must be one of the following three classes:</p> <ul> <li>Iris setosa</li> <li>Iris virginica</li> <li>Iris versicolor</li> </ul> <p>A model trained on the Iris dataset that predicts Iris type on new examples is performing multi-class classification.</p> <p>In contrast, classification problems that distinguish between exactly two classes are <a href="#binary_classification"><strong>binary classification models</strong></a>. 
For example, an email model that predicts either <em>spam</em> or <em>not spam</em> is a binary classification model.</p> <p>In clustering problems, multi-class classification refers to more than two clusters.</p> <p><a class="glossary-anchor" name="multi-class_logistic_regression"></a> <h2 class="hide-from-toc" id="multi-class-logistic-regression" data-text=" multi-class logistic regression" tabindex="-1"> multi-class logistic regression</h2></p> <p>Using <a href="#logistic_regression"><strong>logistic regression</strong></a> in <a href="#multi-class"><strong>multi-class classification</strong></a> problems.</p> <p><a class="glossary-anchor" name="multi-head-self-attention"></a> <h2 class="hide-from-toc" id="multi-head-self-attention" data-text=" multi-head self-attention" tabindex="-1"> multi-head self-attention</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>An extension of <a href="#self-attention"><strong>self-attention</strong></a> that applies the self-attention mechanism multiple times for each position in the input sequence.</p> <p><a href="#Transformer"><strong>Transformers</strong></a> introduced multi-head self-attention.</p> <p><a class="glossary-anchor" name="multimodal-model"></a> <h2 class="hide-from-toc" id="multimodal-model" data-text=" multimodal model" tabindex="-1"> multimodal model</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A model whose inputs and/or outputs include more than one <a href="#modality"><strong>modality</strong></a>. For example, consider a model that takes both an image and a text caption (two modalities) as <a href="#feature"><strong>features</strong></a>, and outputs a score indicating how appropriate the text caption is for the image. So, this model&#39;s inputs are multimodal and the output is unimodal.</p> <p><a class="glossary-anchor" name="multimodal-instruction-tuned"></a> <h2 class="hide-from-toc" id="multimodal-instruction-tuned" data-text=" multimodal instruction-tuned" tabindex="-1"> multimodal instruction-tuned</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>An <a href="#instruction-tuning"><strong>instruction-tuned</strong></a> model that can process input beyond text, such as images, video, and audio.</p> <p><a class="glossary-anchor" name="multinomial_classification"></a> <h2 class="hide-from-toc" id="multinomial-classification" data-text=" multinomial classification" tabindex="-1"> multinomial classification</h2></p> <p>Synonym for <a href="#multi-class"><strong>multi-class classification</strong></a>.</p> <p><a class="glossary-anchor" name="multinomial-regression"></a> <h2 class="hide-from-toc" id="multinomial-regression" data-text=" multinomial regression" tabindex="-1"> multinomial regression</h2></p> <p>Synonym for <a href="#multi-class_logistic_regression"><strong>multi-class logistic regression</strong></a>.</p> <p><a class="glossary-anchor" name="multitask"></a> <h2 class="hide-from-toc" id="multitask" data-text=" multitask" tabindex="-1"> multitask</h2></p> <p>A machine learning technique in which a single <a href="#model"><strong>model</strong></a> is trained to perform multiple <a href="#task"><strong>tasks</strong></a>.</p> <p>Multitask models are created by training on data that is appropriate for each of the different tasks. 
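</p> <p>The following is a minimal sketch, assuming TensorFlow/Keras and two hypothetical tasks (predicting a price and predicting a category), of a single model with one shared layer and two task-specific output heads:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded>import tensorflow as tf

# One input and a shared hidden layer.
inputs = tf.keras.Input(shape=(10,))
shared = tf.keras.layers.Dense(16, activation="relu")(inputs)

# Two task-specific heads: a regression head and a 3-class classification head.
price_head = tf.keras.layers.Dense(1, name="price")(shared)
category_head = tf.keras.layers.Dense(3, activation="softmax",
                                      name="category")(shared)

# A single model trained on both tasks at once.
model = tf.keras.Model(inputs=inputs, outputs=[price_head, category_head])
model.compile(
    optimizer="adam",
    loss={"price": "mse", "category": "sparse_categorical_crossentropy"})
</pre></devsite-code> <p>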
This allows the model to learn to share information across the tasks, which helps the model learn more effectively.</p> <p>A model trained for multiple tasks often has improved generalization abilities and can be more robust at handling different types of data.</p> <p><a class="glossary-anchor" name="n"></a> <h2 class="glossary" id="n" data-text="N" tabindex="-1">N</h2></p> <p><a class="glossary-anchor" name="NaN_trap"></a> <h2 class="hide-from-toc" id="nan-trap" data-text=" NaN trap" tabindex="-1"> NaN trap</h2></p> <p>When one number in your model becomes a <a href="https://wikipedia.org/wiki/NaN">NaN</a> during training, which causes many or all other numbers in your model to eventually become a NaN.</p> <p>NaN is an abbreviation for <strong>N</strong>ot <strong>a</strong> <strong>N</strong>umber.</p> <p><a class="glossary-anchor" name="natural_language_understanding"></a> <h2 class="hide-from-toc" id="natural-language-understanding" data-text=" natural language understanding" tabindex="-1"> natural language understanding</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>Determining a user&#39;s intentions based on what the user typed or said. For example, a search engine uses natural language understanding to determine what the user is searching for based on what the user typed or said.</p> <p><a class="glossary-anchor" name="negative_class"></a> <h2 class="hide-from-toc" id="negative-class" data-text=" negative class" tabindex="-1"> negative class</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>In <a href="#binary_classification"><strong>binary classification</strong></a>, one class is termed <em>positive</em> and the other is termed <em>negative</em>. The positive class is the thing or event that the model is testing for and the negative class is the other possibility. For example:</p> <ul> <li>The negative class in a medical test might be &quot;not tumor.&quot;</li> <li>The negative class in an email classifier might be &quot;not spam.&quot;</li> </ul> <p>Contrast with <a href="#positive_class"><strong>positive class</strong></a>.</p> <p><a class="glossary-anchor" name="negative_sampling"></a> <h2 class="hide-from-toc" id="negative-sampling" data-text=" negative sampling" tabindex="-1"> negative sampling</h2></p> <p>Synonym for <a href="#candidate_sampling"><strong>candidate sampling</strong></a>.</p> <p><a class="glossary-anchor" name="nas"></a> <a class="glossary-anchor" name="neural-architecture-search"></a> <h2 class="hide-from-toc" id="neural-architecture-search-nas" data-text=" Neural Architecture Search (NAS)" tabindex="-1"> Neural Architecture Search (NAS)</h2></p> <p>A technique for automatically designing the architecture of a <a href="#neural-network"><strong>neural network</strong></a>. NAS algorithms can reduce the amount of time and resources required to train a neural network.</p> <p>NAS typically uses:</p> <ul> <li>A search space, which is a set of possible architectures.</li> <li>A fitness function, which is a measure of how well a particular architecture performs on a given task.</li> </ul> <p>NAS algorithms often start with a small set of possible architectures and gradually expand the search space as the algorithm learns more about what architectures are effective. 
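</p> <p>The following is a heavily simplified sketch, assuming a hypothetical <code translate="no" dir="ltr">evaluate_fitness</code> function and a toy search space; real NAS systems use far more sophisticated search strategies, such as the reinforcement learning approach mentioned next:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded>import itertools
import random

# A toy search space: number of layers and neurons per layer.
search_space = list(itertools.product([1, 2, 3], [16, 32, 64]))

def evaluate_fitness(architecture):
  # Hypothetical stand-in for training the candidate architecture and
  # measuring its performance on a given task.
  num_layers, neurons_per_layer = architecture
  return random.random() + 0.01 * num_layers

# Random search: sample candidates and keep the fittest one seen so far.
best_architecture, best_fitness = None, float("-inf")
for _ in range(5):
  candidate = random.choice(search_space)
  fitness = evaluate_fitness(candidate)
  if fitness &gt; best_fitness:
    best_architecture, best_fitness = candidate, fitness

print(best_architecture)
</pre></devsite-code> <p>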
The fitness function is typically based on the performance of the architecture on a training set, and the algorithm is typically trained using a <a href="#reinforcement_learning"><strong>reinforcement learning</strong></a> technique.</p> <p>NAS algorithms have proven effective in finding high-performing architectures for a variety of tasks, including image <a href="#classification_model"><strong>classification</strong></a>, text classification, and machine translation.</p> <p><a class="glossary-anchor" name="neural_network"></a> <a class="glossary-anchor" name="neural-network"></a> <h2 class="hide-from-toc" id="neural-network" data-text=" neural network" tabindex="-1"> neural network</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A <a href="#model"><strong>model</strong></a> containing at least one <a href="#hidden_layer"><strong>hidden layer</strong></a>. A <a href="#deep_neural_network"><strong>deep neural network</strong></a> is a type of neural network containing more than one hidden layer. For example, the following diagram shows a deep neural network containing two hidden layers.</p> <p> <img src="/static/machine-learning/glossary/images/NeuralNetwork.png" loading="lazy" width="750" alt="A neural network with an input layer, two hidden layers, and an output layer." > </p> <p>Each neuron in a neural network connects to all of the nodes in the next layer. For example, in the preceding diagram, notice that each of the three neurons in the first hidden layer separately connect to both of the two neurons in the second hidden layer.</p> <p>Neural networks implemented on computers are sometimes called <strong>artificial neural networks</strong> to differentiate them from neural networks found in brains and other nervous systems.</p> <p>Some neural networks can mimic extremely complex nonlinear relationships between different features and the label.</p> <p>See also <a href="#convolutional_neural_network"><strong>convolutional neural network</strong></a> and <a href="#recurrent_neural_network"><strong>recurrent neural network</strong></a>.</p> <p><a class="glossary-anchor" name="neuron"></a> <h2 class="hide-from-toc" id="neuron" data-text=" neuron" tabindex="-1"> neuron</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>In machine learning, a distinct unit within a <a href="#hidden_layer"><strong>hidden layer</strong></a> of a <a href="#neural_network"><strong>neural network</strong></a>. Each neuron performs the following two-step action:</p> <ol> <li>Calculates the <a href="#weighted_sum"><strong>weighted sum</strong></a> of input values multiplied by their corresponding weights.</li> <li>Passes the weighted sum as input to an <a href="#activation_function"><strong>activation function</strong></a>.</li> </ol> <p>A neuron in the first hidden layer accepts inputs from the feature values in the <a href="#input-layer"><strong>input layer</strong></a>. A neuron in any hidden layer beyond the first accepts inputs from the neurons in the preceding hidden layer. For example, a neuron in the second hidden layer accepts inputs from the neurons in the first hidden layer.</p> <p>The following illustration highlights two neurons and their inputs.</p> <p> <img src="/static/machine-learning/glossary/images/Neurons.png" loading="lazy" width="750" alt="A neural network with an input layer, two hidden layers, and an output layer. 
Two neurons are highlighted: one in the first hidden layer and one in the second hidden layer. The highlighted neuron in the first hidden layer receives inputs from both features in the input layer. The highlighted neuron in the second hidden layer receives inputs from each of the three neurons in the first hidden layer." > </p> <p>A neuron in a neural network mimics the behavior of neurons in brains and other parts of nervous systems.</p> <p><a class="glossary-anchor" name="N-gram"></a> <h2 class="hide-from-toc" id="n-gram" data-text=" N-gram" tabindex="-1"> N-gram</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Sequence Models">#seq</div> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>An ordered sequence of N words. For example, <em>truly madly</em> is a 2-gram. Because order is relevant, <em>madly truly</em> is a different 2-gram than <em>truly madly</em>.</p> <table> <tr> <th>N</th> <th>Name(s) for this kind of N-gram</th> <th>Examples</th> </tr> <tr> <td>2 </td> <td>bigram or 2-gram </td> <td><em>to go, go to, eat lunch, eat dinner</em> </td> </tr> <tr> <td>3 </td> <td>trigram or 3-gram </td> <td><em>ate too much, three blind mice, the bell tolls</em> </td> </tr> <tr> <td>4 </td> <td>4-gram </td> <td><em>walk in the park, dust in the wind, the boy ate lentils</em> </td> </tr> </table> <p>Many <a href="#natural_language_understanding"><strong>natural language understanding</strong></a> models rely on N-grams to predict the next word that the user will type or say. For example, suppose a user typed <em>three blind</em>. An NLU model based on trigrams would likely predict that the user will next type <em>mice</em>.</p> <p>Contrast N-grams with <a href="#bag_of_words"><strong>bag of words</strong></a>, which are unordered sets of words.</p> <p><a class="glossary-anchor" name="NLU"></a> <h2 class="hide-from-toc" id="nlu" data-text=" NLU" tabindex="-1"> NLU</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>Abbreviation for <a href="#natural_language_understanding"><strong>natural language understanding</strong></a>.</p> <p><a class="glossary-anchor" name="node-decision-tree"></a> <h2 class="hide-from-toc" id="node-decision-tree" data-text=" node (decision tree) " tabindex="-1"> node (decision tree) </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>In a <a href="#decision-tree"><strong>decision tree</strong></a>, any <a href="#condition"><strong>condition</strong></a> or <a href="#leaf"><strong>leaf</strong></a>.</p> <p> <img src="/static/machine-learning/glossary/images/node.png" loading="lazy" width="485" alt="A decision tree with two conditions and three leaves." 
> </p> <p><a class="glossary-anchor" name="node"></a> <h2 class="hide-from-toc" id="node-neural-network" data-text=" node (neural network)" tabindex="-1"> node (neural network)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A <a href="#neuron"><strong>neuron</strong></a> in a <a href="#hidden_layer"><strong>hidden layer</strong></a>.</p> <p><a class="glossary-anchor" name="node_graph"></a> <h2 class="hide-from-toc" id="node-tensorflow-graph" data-text=" node (TensorFlow graph)" tabindex="-1"> node (TensorFlow graph)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>An operation in a TensorFlow <a href="#graph"><strong>graph</strong></a>.</p> <p><a class="glossary-anchor" name="noise"></a> <h2 class="hide-from-toc" id="noise" data-text=" noise" tabindex="-1"> noise</h2></p> <p>Broadly speaking, anything that obscures the signal in a dataset. Noise can be introduced into data in a variety of ways. For example:</p> <ul> <li>Human raters make mistakes in labeling.</li> <li>Humans and instruments mis-record or omit feature values.</li> </ul> <p><a class="glossary-anchor" name="non-binary-condition"></a> <h2 class="hide-from-toc" id="non-binary-condition" data-text=" non-binary condition " tabindex="-1"> non-binary condition </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>A <a href="#condition"><strong>condition</strong></a> containing more than two possible outcomes. For example, the following non-binary condition contains three possible outcomes:</p> <p> <img src="/static/machine-learning/glossary/images/non-binary-conditions.png" loading="lazy" width="360" alt="A condition (number_of_legs = ?) that leads to three possible outcomes. One outcome (number_of_legs = 8) leads to a leaf named spider. A second outcome (number_of_legs = 4) leads to a leaf named dog. A third outcome (number_of_legs = 2) leads to a leaf named penguin." > </p> <p><a class="glossary-anchor" name="nonlinear"></a> <h2 class="hide-from-toc" id="nonlinear" data-text=" nonlinear " tabindex="-1"> nonlinear </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A relationship between two or more variables that can&#39;t be represented solely through addition and multiplication. A <em>linear</em> relationship can be represented as a line; a <em>nonlinear</em> relationship can&#39;t be represented as a line. For example, consider two models that each relate a single feature to a single label. The model on the left is linear and the model on the right is nonlinear:</p> <p> <img src="/static/machine-learning/glossary/images/LinearVsNonlinear.png" loading="lazy" alt="Two plots. One plot is a line, so this is a linear relationship. The other plot is a curve, so this is a nonlinear relationship." 
> </p> <p><a class="glossary-anchor" name="non-response_bias"></a> <h2 class="hide-from-toc" id="non-response-bias" data-text=" non-response bias " tabindex="-1"> non-response bias </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>See <a href="#selection_bias"><strong>selection bias</strong></a>.</p> <p><a class="glossary-anchor" name="nonstationarity"></a> <h2 class="hide-from-toc" id="nonstationarity" data-text=" nonstationarity" tabindex="-1"> nonstationarity</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A feature whose values change across one or more dimensions, usually time. For example, consider the following examples of nonstationarity:</p> <ul> <li>The number of swimsuits sold at a particular store varies with the season.</li> <li>The quantity of a particular fruit harvested in a particular region is zero for much of the year but large for a brief period.</li> <li>Due to climate change, annual mean temperatures are shifting.</li> </ul> <p>Contrast with <a href="#stationarity"><strong>stationarity</strong></a>.</p> <p><a class="glossary-anchor" name="normalization"></a> <h2 class="hide-from-toc" id="normalization" data-text=" normalization" tabindex="-1"> normalization</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Broadly speaking, the process of converting a variable&#39;s actual range of values into a standard range of values, such as:</p> <ul> <li>-1 to +1</li> <li>0 to 1</li> <li>Z-scores (roughly, -3 to +3)</li> </ul> <p>For example, suppose the actual range of values of a certain feature is 800 to 2,400. As part of <a href="#feature_engineering"><strong>feature engineering</strong></a>, you could normalize the actual values down to a standard range, such as -1 to +1.</p> <p>Normalization is a common task in <a href="#feature_engineering"><strong>feature engineering</strong></a>. Models usually train faster (and produce better predictions) when every numerical feature in the <a href="#feature_vector"><strong>feature vector</strong></a> has roughly the same range.</p> <p>See the <a href="/machine-learning/crash-course/numerical-data/normalization">Working with Numerical Data module</a> of Machine Learning Crash Course for more details. Also see <a href="#Z-score-normalization"><strong>Z-score normalization</strong></a>.</p> <p><a class="glossary-anchor" name="novelty-detection"></a> <h2 class="hide-from-toc" id="novelty-detection" data-text=" novelty detection" tabindex="-1"> novelty detection</h2></p> <p>The process of determining whether a new (novel) example comes from the same distribution as the <a href="#training_set"><strong>training set</strong></a>. In other words, after training on the training set, novelty detection determines whether a <em>new</em> example (during inference or during additional training) is an <a href="#outliers"><strong>outlier</strong></a>.</p> <p>Contrast with <a href="#outlier-detection"><strong>outlier detection</strong></a>.</p> <p><a class="glossary-anchor" name="numerical_data"></a> <h2 class="hide-from-toc" id="numerical-data" data-text=" numerical data" tabindex="-1"> numerical data</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p><a href="#feature"><strong>Features</strong></a> represented as integers or real-valued numbers. 
For example, a house valuation model would probably represent the size of a house (in square feet or square meters) as numerical data. Representing a feature as numerical data indicates that the feature&#39;s values have a <em>mathematical</em> relationship to the label. That is, the number of square meters in a house probably has some mathematical relationship to the value of the house.</p> <p>Not all integer data should be represented as numerical data. For example, postal codes in some parts of the world are integers; however, integer postal codes shouldn&#39;t be represented as numerical data in models. That&#39;s because a postal code of <code translate="no" dir="ltr">20000</code> is not twice (or half) as potent as a postal code of 10000. Furthermore, although different postal codes <em>do</em> correlate to different real estate values, we can&#39;t assume that real estate values at postal code 20000 are twice as valuable as real estate values at postal code 10000. Postal codes should be represented as <a href="#categorical_data"><strong>categorical data</strong></a> instead.</p> <p>Numerical features are sometimes called <a href="#continuous_feature"><strong>continuous features</strong></a>.</p> <p><a class="glossary-anchor" name="numpy"></a> <h2 class="hide-from-toc" id="numpy" data-text=" NumPy" tabindex="-1"> NumPy</h2></p> <p>An <a href="http://www.numpy.org/" target="T"> open-source math library</a> that provides efficient array operations in Python. <a href="#pandas"><strong>pandas</strong></a> is built on NumPy.</p> <p><a class="glossary-anchor" name="o"></a> <h2 class="glossary" id="o" data-text="O" tabindex="-1">O</h2></p> <p><a class="glossary-anchor" name="objective"></a> <h2 class="hide-from-toc" id="objective" data-text=" objective" tabindex="-1"> objective</h2></p> <p>A metric that your algorithm is trying to optimize.</p> <p><a class="glossary-anchor" name="objective_function"></a> <h2 class="hide-from-toc" id="objective-function" data-text=" objective function" tabindex="-1"> objective function</h2></p> <p>The mathematical formula or <a href="#metric"><strong>metric</strong></a> that a model aims to optimize. For example, the objective function for <a href="#linear_regression"><strong>linear regression</strong></a> is usually <a href="#MSE"><strong>Mean Squared Loss</strong></a>. Therefore, when training a linear regression model, training aims to minimize Mean Squared Loss.</p> <p>In some cases, the goal is to <em>maximize</em> the objective function. For example, if the objective function is accuracy, the goal is to maximize accuracy.</p> <p>See also <a href="#loss"><strong>loss</strong></a>.</p> <p><a class="glossary-anchor" name="oblique-condition"></a> <h2 class="hide-from-toc" id="oblique-condition" data-text=" oblique condition " tabindex="-1"> oblique condition </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>In a <a href="#decision-tree"><strong>decision tree</strong></a>, a <a href="#condition"><strong>condition</strong></a> that involves more than one <a href="#feature"><strong>feature</strong></a>. 
For example, if height and width are both features, then the following is an oblique condition:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Text only"><code translate="no" dir="ltr"> height &gt; width </code></pre></devsite-code> <p>Contrast with <a href="#axis-aligned-condition"><strong>axis-aligned condition</strong></a>.</p> <p><a class="glossary-anchor" name="offline"></a> <h2 class="hide-from-toc" id="offline" data-text=" offline" tabindex="-1"> offline</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Synonym for <a href="#static"><strong>static</strong></a>.</p> <p><a class="glossary-anchor" name="offline_inference"></a> <h2 class="hide-from-toc" id="offline-inference" data-text=" offline inference" tabindex="-1"> offline inference</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The process of a model generating a batch of <a href="#prediction"><strong>predictions</strong></a> and then caching (saving) those predictions. Apps can then access the inferred prediction from the cache rather than rerunning the model.</p> <p>For example, consider a model that generates local weather forecasts (predictions) once every four hours. After each model run, the system caches all the local weather forecasts. Weather apps retrieve the forecasts from the cache.</p> <p>Offline inference is also called <strong>static inference</strong>.</p> <p>Contrast with <a href="#online_inference"><strong>online inference</strong></a>.</p> <p><a class="glossary-anchor" name="one-hot_encoding"></a> <h2 class="hide-from-toc" id="one-hot-encoding" data-text=" one-hot encoding" tabindex="-1"> one-hot encoding</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Representing categorical data as a vector in which:</p> <ul> <li>One element is set to 1.</li> <li>All other elements are set to 0.</li> </ul> <p>One-hot encoding is commonly used to represent strings or identifiers that have a finite set of possible values. For example, suppose a certain categorical feature named <code translate="no" dir="ltr">Scandinavia</code> has five possible values:</p> <ul> <li>&quot;Denmark&quot;</li> <li>&quot;Sweden&quot;</li> <li>&quot;Norway&quot;</li> <li>&quot;Finland&quot;</li> <li>&quot;Iceland&quot;</li> </ul> <p>One-hot encoding could represent each of the five values as follows:</p> <table> <tr><th>country</th> <th colspan=5">Vector</th></tr> <tr><td>"Denmark"</td> <td>1</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td></tr> <tr><td>"Sweden"</td> <td>0</td> <td>1</td> <td>0</td> <td>0</td> <td>0</td></tr> <tr><td>"Norway"</td> <td>0</td> <td>0</td> <td>1</td> <td>0</td> <td>0</td></tr> <tr><td>"Finland"</td> <td>0</td> <td>0</td> <td>0</td> <td>1</td> <td>0</td></tr> <tr><td>"Iceland"</td> <td>0</td> <td>0</td> <td>0</td> <td>0</td> <td>1</td></tr> </table> <p>Thanks to one-hot encoding, a model can learn different connections based on each of the five countries.</p> <p>Representing a feature as <a href="#numerical_data"><strong>numerical data</strong></a> is an alternative to one-hot encoding. Unfortunately, representing the Scandinavian countries numerically is not a good choice. 
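</p> <p>The following is a minimal sketch, assuming NumPy, of building the one-hot vectors shown in the preceding table:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded>import numpy as np

countries = ["Denmark", "Sweden", "Norway", "Finland", "Iceland"]

def one_hot(value):
  # A vector of five 0s with a single 1 at the given country position.
  vector = np.zeros(len(countries), dtype=int)
  vector[countries.index(value)] = 1
  return vector

print(one_hot("Norway"))   # [0 0 1 0 0]
print(one_hot("Iceland"))  # [0 0 0 0 1]
</pre></devsite-code> <p>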
For example, consider the following numeric representation:</p> <ul> <li>&quot;Denmark&quot; is 0</li> <li>&quot;Sweden&quot; is 1</li> <li>&quot;Norway&quot; is 2</li> <li>&quot;Finland&quot; is 3</li> <li>&quot;Iceland&quot; is 4</li> </ul> <p>With numeric encoding, a model would interpret the raw numbers mathematically and would try to train on those numbers. However, Iceland isn&#39;t actually twice as much (or half as much) of something as Norway, so the model would come to some strange conclusions.</p> <p><a class="glossary-anchor" name="one-shot_learning"></a> <h2 class="hide-from-toc" id="one-shot-learning" data-text=" one-shot learning" tabindex="-1"> one-shot learning</h2></p> <p>A machine learning approach, often used for object classification, designed to learn effective classifiers from a single training example.</p> <p>See also <a href="#few-shot_learning"><strong>few-shot learning</strong></a> and <a href="#zero-shot-learning"><strong>zero-shot learning</strong></a>.</p> <p><a class="glossary-anchor" name="one-shot-prompting"></a> <h2 class="hide-from-toc" id="one-shot-prompting" data-text=" one-shot prompting" tabindex="-1"> one-shot prompting</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>A <a href="#prompt"><strong>prompt</strong></a> that contains <em>one</em> example demonstrating how the <a href="#large-language-model"><strong>large language model</strong></a> should respond. For example, the following prompt contains one example showing a large language model how it should answer a query.</p> <table> <tr> <th>Parts of one prompt</th> <th>Notes</th> </tr> <tr> <td><tt>What is the official currency of the specified country?</tt></td> <td>The question you want the LLM to answer.</td> </tr> <tr> <td><tt>France: EUR</tt></td> <td>One example.</td> </tr> <tr> <td><tt>India:</tt></td> <td>The actual query.</td> </tr> </table> <p>Compare and contrast <strong>one-shot prompting</strong> with the following terms:</p> <ul> <li><a href="#zero-shot-prompting"><strong>zero-shot prompting</strong></a></li> <li><a href="#few-shot-prompting"><strong>few-shot prompting</strong></a></li> </ul> <p><a class="glossary-anchor" name="one-vs.-all"></a> <h2 class="hide-from-toc" id="one-vs.-all" data-text=" one-vs.-all" tabindex="-1"> one-vs.-all</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Given a classification problem with N classes, a solution consisting of N separate <a href="#binary_classification"><strong>binary classifiers</strong></a>—one binary classifier for each possible outcome. 
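</p> <p>In code, one-vs.-all amounts to a loop that trains one binary classifier per class and, at prediction time, picks the class whose classifier is most confident. The following is a minimal sketch using scikit-learn; scikit-learn and the <code translate="no" dir="ltr">features</code> and <code translate="no" dir="ltr">labels</code> arrays are assumptions for illustration, and the animal/vegetable/mineral example below maps directly onto it:</p> <devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr"><code translate="no" dir="ltr">import numpy as np
from sklearn.linear_model import LogisticRegression

def train_one_vs_all(features, labels):
    """Trains one binary classifier per class (class versus not-class)."""
    classifiers = {}
    for cls in np.unique(labels):
        binary_labels = (labels == cls).astype(int)
        classifiers[cls] = LogisticRegression().fit(features, binary_labels)
    return classifiers

def predict_one_vs_all(classifiers, features):
    """Picks the class whose binary classifier is most confident."""
    classes = list(classifiers)
    scores = np.stack(
        [classifiers[c].predict_proba(features)[:, 1] for c in classes], axis=1)
    return [classes[i] for i in scores.argmax(axis=1)]
</code></pre></devsite-code> <p>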
For example, given a model that classifies examples as animal, vegetable, or mineral, a one-vs.-all solution would provide the following three separate binary classifiers:</p> <ul> <li>animal versus not animal</li> <li>vegetable versus not vegetable</li> <li>mineral versus not mineral</li> </ul> <p><a class="glossary-anchor" name="online"></a> <h2 class="hide-from-toc" id="online" data-text=" online" tabindex="-1"> online</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Synonym for <a href="#dynamic"><strong>dynamic</strong></a>.</p> <p><a class="glossary-anchor" name="online_inference"></a> <h2 class="hide-from-toc" id="online-inference" data-text=" online inference" tabindex="-1"> online inference</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Generating <a href="#prediction"><strong>predictions</strong></a> on demand. For example, suppose an app passes input to a model and issues a request for a prediction. A system using online inference responds to the request by running the model (and returning the prediction to the app).</p> <p>Contrast with <a href="#offline_inference"><strong>offline inference</strong></a>.</p> <p><a class="glossary-anchor" name="Operation"></a> <h2 class="hide-from-toc" id="operation-op" data-text=" operation (op)" tabindex="-1"> operation (op)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>In TensorFlow, any procedure that creates, manipulates, or destroys a <a href="#tensor"><strong>Tensor</strong></a>. For example, a matrix multiply is an operation that takes two Tensors as input and generates one Tensor as output.</p> <p><a class="glossary-anchor" name="Optax"></a> <h2 class="hide-from-toc" id="optax" data-text=" Optax" tabindex="-1"> Optax</h2></p> <p>A gradient processing and optimization library for <a href="#JAX"><strong>JAX</strong></a>. Optax facilitates research by providing building blocks that can be recombined in custom ways to optimize parametric models such as deep neural networks. Other goals include:</p> <ul> <li>Providing readable, well-tested, efficient implementations of core components.</li> <li>Improving productivity by making it possible to combine low-level ingredients into custom optimizers (or other gradient processing components).</li> <li>Accelerating adoption of new ideas by making it easy for anyone to contribute.</li> </ul> <p><a class="glossary-anchor" name="optimizer"></a> <h2 class="hide-from-toc" id="optimizer" data-text=" optimizer" tabindex="-1"> optimizer</h2></p> <p>A specific implementation of the <a href="#gradient_descent"><strong>gradient descent</strong></a> algorithm. Popular optimizers include:</p> <ul> <li><a href="#AdaGrad"><strong>AdaGrad</strong></a>, which stands for ADAptive GRADient descent.</li> <li>Adam, which stands for ADAptive Moment estimation.</li> </ul> <p><a class="glossary-anchor" name="out-group_homogeneity_bias"></a> <h2 class="hide-from-toc" id="out-group-homogeneity-bias" data-text=" out-group homogeneity bias " tabindex="-1"> out-group homogeneity bias </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>The tendency to see out-group members as more alike than in-group members when comparing attitudes, values, personality traits, and other characteristics.
<strong>In-group</strong> refers to people you interact with regularly; <strong>out-group</strong> refers to people you don&#39;t interact with regularly. If you create a dataset by asking people to provide attributes about out-groups, those attributes may be less nuanced and more stereotyped than attributes that participants list for people in their in-group.</p> <p>For example, Lilliputians might describe the houses of other Lilliputians in great detail, citing small differences in architectural styles, windows, doors, and sizes. However, the same Lilliputians might simply declare that Brobdingnagians all live in identical houses.</p> <p>Out-group homogeneity bias is a form of <a href="#group_attribution_bias"><strong>group attribution bias</strong></a>.</p> <p>See also <a href="#in-group_bias"><strong>in-group bias</strong></a>.</p> <p><a class="glossary-anchor" name="outlier-detection"></a> <h2 class="hide-from-toc" id="outlier-detection" data-text=" outlier detection" tabindex="-1"> outlier detection</h2></p> <p>The process of identifying <a href="#outliers"><strong>outliers</strong></a> in a <a href="#training_set"><strong>training set</strong></a>.</p> <p>Contrast with <a href="#novelty-detection"><strong>novelty detection</strong></a>.</p> <p><a class="glossary-anchor" name="outliers"></a> <h2 class="hide-from-toc" id="outliers" data-text=" outliers" tabindex="-1"> outliers</h2></p> <p>Values distant from most other values. In machine learning, any of the following are outliers:</p> <ul> <li>Input data whose values are more than roughly 3 standard deviations from the mean.</li> <li><a href="#weight"><strong>Weights</strong></a> with high absolute values.</li> <li>Predicted values relatively far away from the actual values.</li> </ul> <p>For example, suppose that <code translate="no" dir="ltr">widget-price</code> is a feature of a certain model. Assume that the mean <code translate="no" dir="ltr">widget-price</code> is 7 Euros with a standard deviation of 1 Euro. Examples containing a <code translate="no" dir="ltr">widget-price</code> of 12 Euros or 2 Euros would therefore be considered outliers because each of those prices is five standard deviations from the mean.</p> <p>Outliers are often caused by typos or other input mistakes. In other cases, outliers aren&#39;t mistakes; after all, values five standard deviations away from the mean are rare but hardly impossible.</p> <p>Outliers often cause problems in model training. <a href="#clipping"><strong>Clipping</strong></a> is one way of managing outliers.</p> <p><a class="glossary-anchor" name="out-of-bag-evaluation"></a> <h2 class="hide-from-toc" id="out-of-bag-evaluation-oob-evaluation" data-text=" out-of-bag evaluation (OOB evaluation) " tabindex="-1"> out-of-bag evaluation (OOB evaluation) </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>A mechanism for evaluating the quality of a <a href="#decision-forest"><strong>decision forest</strong></a> by testing each <a href="#decision-tree"><strong>decision tree</strong></a> against the <a href="#example"><strong>examples</strong></a> <em>not</em> used during <a href="#training"><strong>training</strong></a> of that decision tree. 
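</p> <p>The bookkeeping behind OOB evaluation is modest: remember which examples each tree was bagged on, then score every tree only on the examples it never saw. The following is a minimal sketch of that idea in NumPy; the <code translate="no" dir="ltr">train_tree</code> and <code translate="no" dir="ltr">tree_accuracy</code> callables are placeholders, not real library functions:</p> <devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr"><code translate="no" dir="ltr">import numpy as np

def oob_evaluation(examples, labels, num_trees, train_tree, tree_accuracy):
    """Scores each tree on its out-of-bag rows and averages the results."""
    rng = np.random.default_rng(seed=42)
    n = len(examples)
    scores = []
    for _ in range(num_trees):
        # Bagging: sample n rows with replacement; roughly one-third stay out-of-bag.
        in_bag = rng.integers(0, n, size=n)
        out_of_bag = np.setdiff1d(np.arange(n), in_bag)
        tree = train_tree(examples[in_bag], labels[in_bag])
        scores.append(tree_accuracy(tree, examples[out_of_bag], labels[out_of_bag]))
    return float(np.mean(scores))
</code></pre></devsite-code> <p>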
For example, in the following diagram, notice that the system trains each decision tree on about two-thirds of the examples and then evaluates against the remaining one-third of the examples.</p> <p> <img src="/static/machine-learning/glossary/images/OOBevaluation.png" loading="lazy" width="600" alt="A decision forest consisting of three decision trees. One decision tree trains on two-thirds of the examples and then uses the remaining one-third for OOB evaluation. A second decision tree trains on a different two-thirds of the examples than the previous decision tree, and then uses a different one-third for OOB evaluation than the previous decision tree." > </p> <p>Out-of-bag evaluation is a computationally efficient and conservative approximation of the <a href="#cross-validation"><strong>cross-validation</strong></a> mechanism. In cross-validation, one model is trained for each cross-validation round (for example, 10 models are trained in a 10-fold cross-validation). With OOB evaluation, a single model is trained. Because <a href="#bagging"><strong>bagging</strong></a> withholds some data from each tree during training, OOB evaluation can use that data to approximate cross-validation.</p> <p><a class="glossary-anchor" name="output_layer"></a> <h2 class="hide-from-toc" id="output-layer" data-text=" output layer" tabindex="-1"> output layer</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The &quot;final&quot; layer of a neural network. The output layer contains the prediction.</p> <p>The following illustration shows a small deep neural network with an input layer, two hidden layers, and an output layer:</p> <p> <img src="/static/machine-learning/glossary/images/OutputLayer.png" loading="lazy" width="750" alt="A neural network with one input layer, two hidden layers, and one output layer. The input layer consists of two features. The first hidden layer consists of three neurons and the second hidden layer consists of two neurons. The output layer consists of a single node." > </p> <p><a class="glossary-anchor" name="overfitting"></a> <h2 class="hide-from-toc" id="overfitting" data-text=" overfitting" tabindex="-1"> overfitting</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Creating a <a href="#model"><strong>model</strong></a> that matches the <a href="#training_set"><strong>training data</strong></a> so closely that the model fails to make correct predictions on new data.</p> <p><a href="#regularization"><strong>Regularization</strong></a> can reduce overfitting. Training on a large and diverse training set can also reduce overfitting.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-for-additional-notes._11" data-text=" Click the icon for additional notes. " tabindex="-1"> Click the icon for additional notes. </h4> <div class="expand-background"> <p> Overfitting is like strictly following advice from only your favorite teacher. You'll probably be successful in that teacher's class, but you might "overfit" to that teacher's ideas and be unsuccessful in other classes. Following advice from a mixture of teachers will enable you to adapt better to new situations. 
</p> </div> <hr /> </section> <p><a class="glossary-anchor" name="oversampling"></a> <h2 class="hide-from-toc" id="oversampling" data-text=" oversampling" tabindex="-1"> oversampling</h2></p> <p>Reusing the <a href="#example"><strong>examples</strong></a> of a <a href="#minority_class"><strong>minority class</strong></a> in a <a href="#class_imbalanced_data_set"><strong>class-imbalanced dataset</strong></a> in order to create a more balanced <a href="#training_set"><strong>training set</strong></a>.</p> <p>For example, consider a <a href="#binary_classification"><strong>binary classification</strong></a> problem in which the ratio of the <a href="#majority_class"><strong>majority class</strong></a> to the minority class is 5,000:1. If the dataset contains a million examples, then the dataset contains only about 200 examples of the minority class, which might be too few examples for effective training. To overcome this deficiency, you might oversample (reuse) those 200 examples multiple times, possibly yielding sufficient examples for useful training.</p> <p>You need to be careful about <a href="#overfitting"><strong>overfitting</strong></a> when oversampling.</p> <p>Contrast with <a href="#undersampling"><strong>undersampling</strong></a>.</p> <p><a class="glossary-anchor" name="p"></a> <h2 class="glossary" id="p" data-text="P" tabindex="-1">P</h2></p> <p><a class="glossary-anchor" name="packed-data"></a> <h2 class="hide-from-toc" id="packed-data" data-text=" packed data" tabindex="-1"> packed data</h2></p> <p>An approach for storing data more efficiently.</p> <p>Packed data stores data either by using a compressed format or in some other way that allows it to be accessed more efficiently. Packed data minimizes the amount of memory and computation required to access it, leading to faster training and more efficient model inference.</p> <p>Packed data is often used with other techniques, such as <a href="#data_augmentation"><strong>data augmentation</strong></a> and <a href="#regularization"><strong>regularization</strong></a>, further improving the performance of <a href="#model"><strong>models</strong></a>.</p> <p><a class="glossary-anchor" name="pandas"></a> <h2 class="hide-from-toc" id="pandas" data-text=" pandas" tabindex="-1"> pandas</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A column-oriented data analysis API built on top of <a href="#numpy"><strong>NumPy</strong></a>. Many machine learning frameworks, including TensorFlow, support pandas data structures as inputs. See the <a href="http://pandas.pydata.org/" target="T">pandas documentation</a> for details.</p> <p><a class="glossary-anchor" name="parameter"></a> <h2 class="hide-from-toc" id="parameter" data-text=" parameter" tabindex="-1"> parameter</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The <a href="#weight"><strong>weights</strong></a> and <a href="#bias"><strong>biases</strong></a> that a model learns during <a href="#training"><strong>training</strong></a>.
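</p> <p>As a quick, concrete illustration (the linear regression formula below makes this precise), the following minimal sketch fits a one-feature line with NumPy and prints the two learned parameters; the data is invented for the example:</p> <devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr"><code translate="no" dir="ltr">import numpy as np

# Invented data that roughly follows y = 2x + 5.
x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = np.array([7.1, 8.9, 11.2, 12.8, 15.1])

# polyfit returns the learned parameters: one weight and one bias.
weight, bias = np.polyfit(x, y, deg=1)
print(f"w1 = {weight:.2f}, b = {bias:.2f}")  # Approximately w1 = 2, b = 5.
</code></pre></devsite-code> <p>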
For example, in a <a href="#linear_regression"><strong>linear regression</strong></a> model, the parameters consist of the bias (<em>b</em>) and all the weights (<i>w<sub>1</sub></i>, <i>w<sub>2</sub></i>, and so on) in the following formula:</p> <div> $$y' = b + w_1x_1 + w_2x_2 + \ldots + w_nx_n$$ </div> <p>In contrast, <a href="#hyperparameter"><strong>hyperparameters</strong></a> are the values that <em>you</em> (or a hyperparameter tuning service) supply to the model. For example, <a href="#learning_rate"><strong>learning rate</strong></a> is a hyperparameter.</p> <p><a class="glossary-anchor" name="parameter-efficient-tuning"></a> <h2 class="hide-from-toc" id="parameter-efficient-tuning" data-text=" parameter-efficient tuning" tabindex="-1"> parameter-efficient tuning</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>A set of techniques to <a href="#fine-tuning"><strong>fine-tune</strong></a> a large <a href="#pre-trained-model"><strong>pre-trained language model (PLM)</strong></a> more efficiently than full <a href="#fine-tuning"><strong>fine-tuning</strong></a>. Parameter-efficient tuning typically fine-tunes far fewer <a href="#parameter"><strong>parameters</strong></a> than full fine-tuning, yet generally produces a <a href="#large-language-model"><strong>large language model</strong></a> that performs as well (or almost as well) as a large language model built from full fine-tuning.</p> <p>Compare and contrast parameter-efficient tuning with:</p> <ul> <li><a href="#instruction-tuning"><strong>instruction tuning</strong></a></li> <li><a href="#prompt-tuning"><strong>prompt tuning</strong></a></li> </ul> <p>Parameter-efficient tuning is also known as <strong>parameter-efficient fine-tuning</strong>.</p> <p><a class="glossary-anchor" name="Parameter_Server"></a> <h2 class="hide-from-toc" id="parameter-server-ps" data-text=" Parameter Server (PS)" tabindex="-1"> Parameter Server (PS)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>A job that keeps track of a model&#39;s <a href="#parameter"><strong>parameters</strong></a> in a distributed setting.</p> <p><a class="glossary-anchor" name="parameter_update"></a> <h2 class="hide-from-toc" id="parameter-update" data-text=" parameter update" tabindex="-1"> parameter update</h2></p> <p>The operation of adjusting a model&#39;s <a href="#parameter"><strong>parameters</strong></a> during training, typically within a single iteration of <a href="#gradient_descent"><strong>gradient descent</strong></a>.</p> <p><a class="glossary-anchor" name="partial_derivative"></a> <h2 class="hide-from-toc" id="partial-derivative" data-text=" partial derivative" tabindex="-1"> partial derivative</h2></p> <p>A derivative in which all but one of the variables is considered a constant. For example, the partial derivative of <em>f(x, y)</em> with respect to <em>x</em> is the derivative of <em>f</em> considered as a function of <em>x</em> alone (that is, keeping <em>y</em> constant).
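</p> <p>For instance, for the simple function below (chosen only for illustration), treating <em>y</em> as a constant gives:</p> <div> $$f(x, y) = x^2y + y^3 \quad\Rightarrow\quad \frac{\partial f}{\partial x} = 2xy, \qquad \frac{\partial f}{\partial y} = x^2 + 3y^2$$ </div> <p>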
The partial derivative of <em>f</em> with respect to <em>x</em> focuses only on how <em>x</em> is changing and ignores all other variables in the equation.</p> <p><a class="glossary-anchor" name="participation_bias"></a> <h2 class="hide-from-toc" id="participation-bias" data-text=" participation bias" tabindex="-1"> participation bias</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>Synonym for non-response bias. See <a href="#selection_bias"><strong>selection bias</strong></a>.</p> <p><a class="glossary-anchor" name="partitioning_strategy"></a> <h2 class="hide-from-toc" id="partitioning-strategy" data-text=" partitioning strategy" tabindex="-1"> partitioning strategy</h2></p> <p>The algorithm by which variables are divided across <a href="#Parameter_Server"><strong>parameter servers</strong></a>.</p> <p><a class="glossary-anchor" name="pax"></a> <h2 class="hide-from-toc" id="pax" data-text=" Pax" tabindex="-1"> Pax</h2></p> <p>A programming framework designed for training large-scale <a href="#neural-network"><strong>neural network</strong></a> <a href="#model"><strong>models</strong></a> so large that they span multiple <a href="#TPU"><strong>TPU</strong></a> <a href="#accelerator-chip"><strong>accelerator chip</strong></a> <a href="#TPU_slice"><strong>slices</strong></a> or <a href="#TPU_Pod"><strong>pods</strong></a>.</p> <p>Pax is built on <a href="#flax"><strong>Flax</strong></a>, which is built on <a href="#JAX"><strong>JAX</strong></a>.</p> <p> <img src="/static/machine-learning/glossary/images/Pax.png" width="300" loading="lazy" alt="Diagram indicating Pax's position in the software stack. Pax is built on top of JAX. Pax itself consists of three layers. The bottom layer contains TensorStore and Flax. The middle layer contains Optax and Flaxformer. The top layer contains Praxis Modeling Library. Fiddle is built on top of Pax." > </p> <p><a class="glossary-anchor" name="perceptron"></a> <h2 class="hide-from-toc" id="perceptron" data-text=" perceptron" tabindex="-1"> perceptron</h2></p> <p>A system (either hardware or software) that takes in one or more input values, runs a function on the weighted sum of the inputs, and computes a single output value. In machine learning, the function is typically nonlinear, such as <a href="#ReLU"><strong>ReLU</strong></a>, <a href="#sigmoid-function"><strong>sigmoid</strong></a>, or <a href="https://wikipedia.org/wiki/Hyperbolic_functions" target="T">tanh</a>. For example, the following perceptron relies on the sigmoid function to process three input values:</p> <div> $$f(x_1, x_2, x_3) = \text{sigmoid}(w_1 x_1 + w_2 x_2 + w_3 x_3)$$ </div> <p>In the following illustration, the perceptron takes three inputs, each of which is itself modified by a weight before entering the perceptron:</p> <p> <img src="/static/machine-learning/glossary/images/Perceptron.svg" width="525" loading="lazy" alt="A perceptron that takes in 3 inputs, each multiplied by separate weights. The perceptron outputs a single value." > </p> <p>Perceptrons are the <a href="#neuron"><strong>neurons</strong></a> in <a href="#neural-network"><strong>neural networks</strong></a>.</p> <p><a class="glossary-anchor" name="performance"></a> <h2 class="hide-from-toc" id="performance" data-text=" performance" tabindex="-1"> performance</h2></p> <p>Overloaded term with the following meanings:</p> <ul> <li>The standard meaning within software engineering. 
Namely: How fast (or efficiently) does this piece of software run?</li> <li>The meaning within machine learning. Here, performance answers the following question: How correct is this <a href="#model"><strong>model</strong></a>? That is, how good are the model&#39;s predictions?</li> </ul> <p><a class="glossary-anchor" name="permutation-variable-importances"></a> <h2 class="hide-from-toc" id="permutation-variable-importances" data-text=" permutation variable importances " tabindex="-1"> permutation variable importances </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>A type of <a href="#variable-importances"><strong>variable importance</strong></a> that evaluates the increase in the prediction error of a model <em>after</em> permuting the feature&#39;s values. Permutation variable importance is a model-independent metric.</p> <p><a class="glossary-anchor" name="perplexity"></a> <h2 class="hide-from-toc" id="perplexity" data-text=" perplexity" tabindex="-1"> perplexity</h2></p> <p>One measure of how well a <a href="#model"><strong>model</strong></a> is accomplishing its task. For example, suppose your task is to read the first few letters of a word a user is typing on a phone keyboard, and to offer a list of possible completion words. Perplexity, P, for this task is approximately the number of guesses you need to offer in order for your list to contain the actual word the user is trying to type.</p> <p>Perplexity is related to <a href="#cross-entropy"><strong>cross-entropy</strong></a> as follows:</p> <div> $$P= 2^{-\text{cross entropy}}$$ </div> <p><a class="glossary-anchor" name="pipeline"></a> <h2 class="hide-from-toc" id="pipeline" data-text=" pipeline" tabindex="-1"> pipeline</h2></p> <p>The infrastructure surrounding a machine learning algorithm. A pipeline includes gathering the data, putting the data into training data files, training one or more models, and exporting the models to production.</p> <p><a class="glossary-anchor" name="pipelining"></a> <h2 class="hide-from-toc" id="pipelining" data-text=" pipelining" tabindex="-1"> pipelining</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A form of <a href="#model-parallelism"><strong>model parallelism</strong></a> in which a model&#39;s processing is divided into consecutive stages and each stage is executed on a different device. While a stage is processing one batch, the preceding stage can work on the next batch.</p> <p>See also <a href="#staged-training"><strong>staged training</strong></a>.</p> <p><a class="glossary-anchor" name="pjit"></a> <h2 class="hide-from-toc" id="pjit" data-text=" pjit" tabindex="-1"> pjit</h2></p> <p>A <a href="#JAX"><strong>JAX</strong></a> function that splits code to run across multiple <a href="#accelerator-chip"><strong>accelerator chips</strong></a>. The user passes a function to pjit, which returns a function that has the equivalent semantics but is compiled into an <a href="#XLA"><strong>XLA</strong></a> computation that runs across multiple devices (such as GPUs or <a href="#TPU"><strong>TPU</strong></a> cores).</p> <p>pjit enables users to shard computations without rewriting them by using the <a href="#single-program"><strong>SPMD</strong></a> partitioner.</p> <p>As of March 2023, <code translate="no" dir="ltr">pjit</code> has been merged with <code translate="no" dir="ltr">jit</code>. 
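</p> <p>As a small orientation (not a full sharding example), here is a minimal sketch of <code translate="no" dir="ltr">jax.jit</code>, which now provides the functionality formerly exposed as <code translate="no" dir="ltr">pjit</code>; when inputs are sharded across devices, the compiled function runs as an SPMD program:</p> <devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr"><code translate="no" dir="ltr">import jax
import jax.numpy as jnp

@jax.jit  # Compiles the function with XLA; runs on CPU, GPU, or TPU.
def predict(weights, bias, features):
    return jnp.dot(features, weights) + bias

w = jnp.ones((3,))
x = jnp.arange(6.0).reshape(2, 3)
print(predict(w, 0.5, x))  # The first call compiles; later calls reuse the compiled program.
</code></pre></devsite-code> <p>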
Refer to <a href="https://jax.readthedocs.io/en/latest/notebooks/Distributed_arrays_and_automatic_parallelization.html">Distributed arrays and automatic parallelization</a> for more details.</p> <p><a class="glossary-anchor" name="PLM"></a> <h2 class="hide-from-toc" id="plm" data-text=" PLM" tabindex="-1"> PLM</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>Abbreviation for <a href="#pre-trained-model"><strong>pre-trained language model</strong></a>.</p> <p><a class="glossary-anchor" name="pmap"></a> <h2 class="hide-from-toc" id="pmap" data-text=" pmap" tabindex="-1"> pmap</h2></p> <p>A <a href="#JAX"><strong>JAX</strong></a> function that executes copies of an input function on multiple underlying hardware devices (CPUs, GPUs, or <a href="#TPU"><strong>TPUs</strong></a>), with different input values. pmap relies on <a href="#single-program"><strong>SPMD</strong></a>.</p> <p><a class="glossary-anchor" name="policy"></a> <h2 class="hide-from-toc" id="policy" data-text=" policy" tabindex="-1"> policy</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>In reinforcement learning, an <a href="#agent"><strong>agent&#39;s</strong></a> probabilistic mapping from <a href="#state"><strong>states</strong></a> to <a href="#action"><strong>actions</strong></a>.</p> <p><a class="glossary-anchor" name="pooling"></a> <h2 class="hide-from-toc" id="pooling" data-text=" pooling" tabindex="-1"> pooling</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>Reducing a matrix (or matrixes) created by an earlier <a href="#convolutional_layer"><strong>convolutional layer</strong></a> to a smaller matrix. Pooling usually involves taking either the maximum or average value across the pooled area. For example, suppose we have the following 3x3 matrix:</p> <p> <img src="/static/machine-learning/glossary/images/PoolingStart.svg" loading="lazy" width="120" alt="The 3x3 matrix [[5,3,1], [8,2,5], [9,4,3]]." > </p> <p>A pooling operation, just like a convolutional operation, divides that matrix into slices and then slides that convolutional operation by <a href="#stride"><strong>strides</strong></a>. For example, suppose the pooling operation divides the convolutional matrix into 2x2 slices with a 1x1 stride. As the following diagram illustrates, four pooling operations take place. Imagine that each pooling operation picks the maximum value of the four in that slice:</p> <p> <img src="/static/machine-learning/glossary/images/PoolingConvolution.svg" loading="lazy" width="400" alt="The input matrix is 3x3 with the values: [[5,3,1], [8,2,5], [9,4,3]]. The top-left 2x2 submatrix of the input matrix is [[5,3], [8,2]], so the top-left pooling operation yields the value 8 (which is the maximum of 5, 3, 8, and 2). The top-right 2x2 submatrix of the input matrix is [[3,1], [2,5]], so the top-right pooling operation yields the value 5. The bottom-left 2x2 submatrix of the input matrix is [[8,2], [9,4]], so the bottom-left pooling operation yields the value 9. The bottom-right 2x2 submatrix of the input matrix is [[2,5], [4,3]], so the bottom-right pooling operation yields the value 5. In summary, the pooling operation yields the 2x2 matrix [[8,5], [9,5]]." 
> </p> <p>Pooling helps enforce <a href="#translational_invariance"><strong>translational invariance</strong></a> in the input matrix.</p> <p>Pooling for vision applications is known more formally as <strong>spatial pooling</strong>. Time-series applications usually refer to pooling as <strong>temporal pooling</strong>. Less formally, pooling is often called <strong>subsampling</strong> or <strong>downsampling</strong>.</p> <p><a class="glossary-anchor" name="positional_encoding"></a> <h2 class="hide-from-toc" id="positional-encoding" data-text=" positional encoding" tabindex="-1"> positional encoding</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A technique to add information about the <em>position</em> of a token in a sequence to the token&#39;s embedding. <a href="#transformer"><strong>Transformer models</strong></a> use positional encoding to better understand the relationship between different parts of the sequence.</p> <p>A common implementation of positional encoding uses a sinusoidal function. (Specifically, the frequency and amplitude of the sinusoidal function are determined by the position of the token in the sequence.) This technique enables a Transformer model to learn to attend to different parts of the sequence based on their position.</p> <p><a class="glossary-anchor" name="positive_class"></a> <h2 class="hide-from-toc" id="positive-class" data-text=" positive class" tabindex="-1"> positive class</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The class you are testing for.</p> <p>For example, the positive class in a cancer model might be &quot;tumor.&quot; The positive class in an email classifier might be &quot;spam.&quot;</p> <p>Contrast with <a href="#negative_class"><strong>negative class</strong></a>.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-for-additional-notes._12" data-text=" Click the icon for additional notes. " tabindex="-1"> Click the icon for additional notes. </h4> <div class="expand-background"> <p> The term <b>positive class</b> can be confusing because the "positive" outcome of many tests is often an undesirable result. For example, the positive class in many medical tests corresponds to tumors or diseases. In general, you want a doctor to tell you, "Congratulations! Your test results were negative." Regardless, the positive class is the event that the test is seeking to find. </p> <p> Admittedly, you're simultaneously testing for both the positive and negative classes. </p> </div> <hr /> </section> <p><a class="glossary-anchor" name="postprocessing"></a> <a class="glossary-anchor" name="post-processing"></a> <h2 class="hide-from-toc" id="post-processing" data-text=" post-processing" tabindex="-1"> post-processing</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Adjusting the output of a model <em>after</em> the model has been run. 
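</p> <p>A common, simple form of post-processing is applying a classification threshold to raw scores after inference; the fairness use described next builds on exactly this kind of adjustment. A minimal sketch (the scores and threshold are illustrative):</p> <devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr"><code translate="no" dir="ltr">import numpy as np

# Raw model outputs (probabilities of the positive class), already computed.
scores = np.array([0.10, 0.45, 0.62, 0.91])

# Post-processing step: convert scores to class labels with a chosen threshold.
threshold = 0.5
predicted_labels = (scores &gt;= threshold).astype(int)
print(predicted_labels)  # [0 0 1 1]
</code></pre></devsite-code> <p>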
Post-processing can be used to enforce fairness constraints without modifying models themselves.</p> <p>For example, one might apply post-processing to a binary classifier by setting a classification threshold such that <a href="#equality_of_opportunity"><strong>equality of opportunity</strong></a> is maintained for some attribute by checking that the <a href="#TP_rate"><strong>true positive rate</strong></a> is the same for all values of that attribute.</p> <p><a class="glossary-anchor" name="PRAUC"></a> <a class="glossary-anchor" name="PR_AUC"></a> <a class="glossary-anchor" name="area_under_the_pr_curve"></a> <h2 class="hide-from-toc" id="pr-auc-area-under-the-pr-curve" data-text=" PR AUC (area under the PR curve)" tabindex="-1"> PR AUC (area under the PR curve)</h2></p> <p>Area under the interpolated <a href="#precision-recall_curve"><strong>precision-recall curve</strong></a>, obtained by plotting (recall, precision) points for different values of the <a href="#classification_threshold"><strong>classification threshold</strong></a>. Depending on how it&#39;s calculated, PR AUC may be equivalent to the <a href="#average_precision"><strong>average precision</strong></a> of the model.</p> <p><a class="glossary-anchor" name="Praxis"></a> <h2 class="hide-from-toc" id="praxis" data-text=" Praxis" tabindex="-1"> Praxis</h2></p> <p>A core, high-performance ML library of <a href="#pax"><strong>Pax</strong></a>. Praxis is often called the &quot;Layer library&quot;.</p> <p>Praxis contains not just the definitions for the Layer class, but most of its supporting components as well, including:</p> <ul> <li>data inputs</li> <li>configuration libraries (HParam and <a href="#fiddle"><strong>Fiddle</strong></a>)</li> <li><a href="#optimizer"><strong>optimizers</strong></a></li> </ul> <p>Praxis provides the definitions for the Model class.</p> <p><a class="glossary-anchor" name="precision"></a> <h2 class="hide-from-toc" id="precision" data-text=" precision" tabindex="-1"> precision</h2></p> <p>A metric for <a href="#classification_model"><strong>classification models</strong></a> that answers the following question:</p> <blockquote> <p>When the model predicted the <a href="#positive_class"><strong>positive class</strong></a>, what percentage of the predictions were correct?</p> </blockquote> <p>Here is the formula:</p> <div> $$\text{Precision} = \frac{\text{true positives}} {\text{true positives} + \text{false positives}}$$ </div> <p>where:</p> <ul> <li>true positive means the model <em>correctly</em> predicted the positive class.</li> <li>false positive means the model <em>mistakenly</em> predicted the positive class.</li> </ul> <p>For example, suppose a model made 200 positive predictions. 
Of these 200 positive predictions:</p> <ul> <li>150 were true positives.</li> <li>50 were false positives.</li> </ul> <p>In this case:</p> <div> $$\text{Precision} = \frac{\text{150}} {\text{150} + \text{50}} = 0.75$$ </div> <p>Contrast with <a href="#accuracy"><strong>accuracy</strong></a> and <a href="#recall"><strong>recall</strong></a>.</p> <p>See <a href="/machine-learning/crash-course/classification/accuracy-precision-recall">Classification: Accuracy, recall, precision and related metrics</a> for more information.</p> <p><a class="glossary-anchor" name="precision-recall_curve"></a> <h2 class="hide-from-toc" id="precision-recall-curve" data-text=" precision-recall curve" tabindex="-1"> precision-recall curve</h2></p> <p>A curve of <a href="#precision"><strong>precision</strong></a> versus <a href="#recall"><strong>recall</strong></a> at different <a href="#classification_threshold"><strong>classification thresholds</strong></a>.</p> <p><a class="glossary-anchor" name="prediction"></a> <h2 class="hide-from-toc" id="prediction" data-text=" prediction" tabindex="-1"> prediction</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A model&#39;s output. For example:</p> <ul> <li>The prediction of a binary classification model is either the positive class or the negative class.</li> <li>The prediction of a multi-class classification model is one class.</li> <li>The prediction of a linear regression model is a number.</li> </ul> <p><a class="glossary-anchor" name="prediction_bias"></a> <h2 class="hide-from-toc" id="prediction-bias" data-text=" prediction bias" tabindex="-1"> prediction bias</h2></p> <p>A value indicating how far apart the average of <a href="#prediction"><strong>predictions</strong></a> is from the average of <a href="#label"><strong>labels</strong></a> in the dataset.</p> <p>Not to be confused with the <a href="#bias"><strong>bias term</strong></a> in machine learning models or with <a href="#bias_ethics"><strong>bias in ethics and fairness</strong></a>.</p> <p><a class="glossary-anchor" name="predictive-ML"></a> <h2 class="hide-from-toc" id="predictive-ml" data-text=" predictive ML" tabindex="-1"> predictive ML</h2></p> <p>Any standard (&quot;classic&quot;) <a href="#machine_learning"><strong>machine learning</strong></a> system.</p> <p>The term <strong>predictive ML</strong> doesn&#39;t have a formal definition. 
Rather, the term distinguishes a category of ML systems <em>not</em> based on <a href="#generative-AI"><strong>generative AI</strong></a>.</p> <p><a class="glossary-anchor" name="predictive_parity"></a> <h2 class="hide-from-toc" id="predictive-parity" data-text=" predictive parity" tabindex="-1"> predictive parity</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>A <a href="#fairness_metric"><strong>fairness metric</strong></a> that checks whether, for a given classifier, the <a href="#precision"><strong>precision</strong></a> rates are equivalent for subgroups under consideration.</p> <p>For example, a model that predicts college acceptance would satisfy predictive parity for nationality if its precision rate is the same for Lilliputians and Brobdingnagians.</p> <p>Predictive parity is sometimes also called <em>predictive rate parity</em>.</p> <p>See <a href="http://fairware.cs.umass.edu/papers/Verma.pdf">&quot;Fairness Definitions Explained&quot;</a> (section 3.2.1) for a more detailed discussion of predictive parity.</p> <p><a class="glossary-anchor" name="predictive_rate_parity"></a> <h2 class="hide-from-toc" id="predictive-rate-parity" data-text=" predictive rate parity" tabindex="-1"> predictive rate parity</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>Another name for <a href="#predictive_parity"><strong>predictive parity</strong></a>.</p> <p><a class="glossary-anchor" name="preprocessing"></a> <h2 class="hide-from-toc" id="preprocessing" data-text=" preprocessing" tabindex="-1"> preprocessing</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div> Processing data before it&#39;s used to train a model. Preprocessing could be as simple as removing words from an English text corpus that don&#39;t occur in the English dictionary, or could be as complex as re-expressing data points in a way that eliminates as many attributes that are correlated with <a href="#sensitive_attribute"><strong>sensitive attributes</strong></a> as possible. Preprocessing can help satisfy <a href="#fairness_constraint"><strong>fairness constraints</strong></a>.</p> <p><a class="glossary-anchor" name="pre-trained-model"></a> <h2 class="hide-from-toc" id="pre-trained-model" data-text=" pre-trained model" tabindex="-1"> pre-trained model</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Image Models">#image</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>Models or model components (such as an <a href="#embedding_vector"><strong>embedding vector</strong></a>) that have already been trained. Sometimes, you&#39;ll feed pre-trained embedding vectors into a <a href="#neural_network"><strong>neural network</strong></a>.
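</p> <p>The following is a minimal sketch of that first case in Keras, where a pre-trained embedding matrix (here just a random stand-in) is loaded into an <code translate="no" dir="ltr">Embedding</code> layer and frozen:</p> <devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr"><code translate="no" dir="ltr">import numpy as np
import tensorflow as tf

vocab_size, embedding_dim = 1000, 16

# Stand-in for embedding vectors that were trained elsewhere.
pretrained_matrix = np.random.rand(vocab_size, embedding_dim).astype("float32")

embedding_layer = tf.keras.layers.Embedding(
    vocab_size,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(pretrained_matrix),
    trainable=False,  # Freeze the pre-trained vectors.
)
</code></pre></devsite-code> <p>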
Other times, your model will train the embedding vectors themselves rather than rely on the pre-trained embeddings.</p> <p>The term <strong>pre-trained language model</strong> refers to a <a href="#large-language-model"><strong>large language model</strong></a> that has gone through <a href="#pre-training"><strong>pre-training</strong></a>.</p> <p><a class="glossary-anchor" name="pre-training"></a> <h2 class="hide-from-toc" id="pre-training" data-text=" pre-training" tabindex="-1"> pre-training</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Image Models">#image</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>The initial training of a model on a large dataset. Some pre-trained models are clumsy giants and must typically be refined through additional training. For example, ML experts might pre-train a <a href="#large-language-model"><strong>large language model</strong></a> on a vast text dataset, such as all the English pages in Wikipedia. Following pre-training, the resulting model might be further refined through any of the following techniques:</p> <ul> <li><a href="#distillation"><strong>distillation</strong></a></li> <li><a href="#fine-tuning"><strong>fine-tuning</strong></a></li> <li><a href="#instruction-tuning"><strong>instruction tuning</strong></a></li> <li><a href="#parameter-efficient-tuning"><strong>parameter-efficient tuning</strong></a></li> <li><a href="#prompt-tuning"><strong>prompt-tuning</strong></a></li> </ul> <p><a class="glossary-anchor" name="prior_belief"></a> <h2 class="hide-from-toc" id="prior-belief" data-text=" prior belief" tabindex="-1"> prior belief</h2></p> <p>What you believe about the data before you begin training on it. For example, <a href="#L2_regularization"><strong>L<sub>2</sub> regularization</strong></a> relies on a prior belief that <a href="#weight"><strong>weights</strong></a> should be small and normally distributed around zero.</p> <p><a class="glossary-anchor" name="probabilistic-regression-model"></a> <h2 class="hide-from-toc" id="probabilistic-regression-model" data-text=" probabilistic regression model" tabindex="-1"> probabilistic regression model</h2></p> <p>A <a href="#regression_model"><strong>regression model</strong></a> that uses not only the <a href="#weight"><strong>weights</strong></a> for each <a href="#feature"><strong>feature</strong></a>, but also the uncertainty of those weights. A probabilistic regression model generates a prediction and the uncertainty of that prediction. For example, a probabilistic regression model might yield a prediction of 325 with a standard deviation of 12. For more information about probabilistic regression models, see this <a href="https://www.tensorflow.org/probability/examples/Probabilistic_Layers_Regression">Colab on tensorflow.org</a>.</p> <p><a class="glossary-anchor" name="probability_density_function"></a> <h2 class="hide-from-toc" id="probability-density-function" data-text=" probability density function" tabindex="-1"> probability density function</h2></p> <p>A function that identifies the frequency of data samples having <em>exactly</em> a particular value. When a dataset&#39;s values are continuous floating-point numbers, exact matches rarely occur. 
However, <em>integrating</em> a probability density function from value <code translate="no" dir="ltr">x</code> to value <code translate="no" dir="ltr">y</code> yields the expected frequency of data samples between <code translate="no" dir="ltr">x</code> and <code translate="no" dir="ltr">y</code>.</p> <p>For example, consider a normal distribution having a mean of 200 and a standard deviation of 30. To determine the expected frequency of data samples falling within the range 211.4 to 218.7, you can integrate the probability density function for a normal distribution from 211.4 to 218.7.</p> <p><a class="glossary-anchor" name="prompt"></a> <h2 class="hide-from-toc" id="prompt" data-text=" prompt" tabindex="-1"> prompt</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>Any text entered as input to a <a href="#large-language-model"><strong>large language model</strong></a> to condition the model to behave in a certain way. Prompts can be as short as a phrase or arbitrarily long (for example, the entire text of a novel). Prompts fall into multiple categories, including those shown in the following table:</p> <table> <tr><th>Prompt category</th> <th>Example</th> <th>Notes</th></tr> <tr> <td>Question</td> <td><tt>How fast can a pigeon fly?</tt></td> </tr> <tr> <td>Instruction</td> <td><tt>Write a funny poem about arbitrage.</tt></td> <td>A prompt that asks the large language model to <i>do</i> something.</td> </tr> <tr> <td>Example</td> <td><tt>Translate Markdown code to HTML. For example: <br/> Markdown: * list item <br/> HTML: &lt;ul&gt; &lt;li&gt;list item&lt;/li&gt; &lt;/ul&gt;</tt></td> <td>The first sentence in this example prompt is an instruction. The remainder of the prompt is the example. </td> </tr> <tr> <td><a href="#role-prompting"><b>Role</b></a></td> <td><tt>Explain why gradient descent is used in machine learning training to a PhD in Physics.</tt></td> <td>The first part of the sentence is an instruction; the phrase "to a PhD in Physics" is the role portion.</td> </tr> <tr> <td>Partial input for the model to complete</td> <td><tt>The Prime Minister of the United Kingdom lives at</tt></td> <td>A partial input prompt can either end abruptly (as this example does) or end with an underscore.</td> </tr> </table> <p>A <a href="#generative-AI"><strong>generative AI</strong></a> model can respond to a prompt with text, code, images, <a href="#embedding_vector"><strong>embeddings</strong></a>, videos…almost anything.</p> <p><a class="glossary-anchor" name="prompt-based-learning"></a> <h2 class="hide-from-toc" id="prompt-based-learning" data-text=" prompt-based learning" tabindex="-1"> prompt-based learning</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>A capability of certain <a href="#model"><strong>models</strong></a> that enables them to adapt their behavior in response to arbitrary text input (<a href="#prompt"><strong>prompts</strong></a>). In a typical prompt-based learning paradigm, a <a href="#large-language-model"><strong>large language model</strong></a> responds to a prompt by generating text. 
For example, suppose a user enters the following prompt:</p> <blockquote> <p>Summarize Newton&#39;s Third Law of Motion.</p> </blockquote> <p>A model capable of prompt-based learning isn&#39;t specifically trained to answer the previous prompt. Rather, the model &quot;knows&quot; a lot of facts about physics, a lot about general language rules, and a lot about what constitutes generally useful answers. That knowledge is sufficient to provide a (hopefully) useful answer. Additional human feedback (&quot;That answer was too complicated.&quot; or &quot;What&#39;s a reaction?&quot;) enables some prompt-based learning systems to gradually improve the usefulness of their answers.</p> <p><a class="glossary-anchor" name="prompt-design"></a> <h2 class="hide-from-toc" id="prompt-design" data-text=" prompt design" tabindex="-1"> prompt design</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>Synonym for <a href="#prompt-engineering"><strong>prompt engineering</strong></a>.</p> <p><a class="glossary-anchor" name="prompt-engineering"></a> <h2 class="hide-from-toc" id="prompt-engineering" data-text=" prompt engineering" tabindex="-1"> prompt engineering</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>The art of creating <a href="#prompt"><strong>prompts</strong></a> that elicit the desired responses from a <a href="#large-language-model"><strong>large language model</strong></a>. Humans perform prompt engineering. Writing well-structured prompts is an essential part of ensuring useful responses from a large language model. Prompt engineering depends on many factors, including:</p> <ul> <li>The dataset used to <a href="#pre-training"><strong>pre-train</strong></a> and possibly <a href="#fine-tuning"><strong>fine-tune</strong></a> the large language model.</li> <li>The <a href="#temperature"><strong>temperature</strong></a> and other decoding parameters that the model uses to generate responses.</li> </ul> <p>See <a href="https://developers.generativeai.google/guide/prompt_best_practices">Introduction to prompt design</a> for more details on writing helpful prompts.</p> <p><a href="#prompt-design"><strong>Prompt design</strong></a> is a synonym for prompt engineering.</p> <p><a class="glossary-anchor" name="prompt-tuning"></a> <h2 class="hide-from-toc" id="prompt-tuning" data-text=" prompt tuning" tabindex="-1"> prompt tuning</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>A <a href="#parameter-efficient-tuning"><strong>parameter efficient tuning</strong></a> mechanism that learns a &quot;prefix&quot; that the system prepends to the actual <a href="#prompt"><strong>prompt</strong></a>.</p> <p>One variation of prompt tuning—sometimes called <strong>prefix tuning</strong>—is to prepend the prefix at <em>every layer</em>. In contrast, most prompt tuning only adds a prefix to the <a href="#input-layer"><strong>input layer</strong></a>.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-to-learn-more-about-prefixes." data-text=" Click the icon to learn more about prefixes. " tabindex="-1"> Click the icon to learn more about prefixes. 
</h4> <div class="expand-background"> <p>For prompt tuning, the "prefix" (also known as a "soft prompt") is a handful of learned, task-specific vectors prepended to the text token embeddings from the actual prompt. The system learns the soft prompt by freezing all other model parameters and fine-tuning on a specific task.</p> </div> <hr /> </section> <p><a class="glossary-anchor" name="proxy_labels"></a> <h2 class="hide-from-toc" id="proxy-labels" data-text=" proxy labels" tabindex="-1"> proxy labels</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Data used to approximate labels not directly available in a dataset.</p> <p>For example, suppose you must train a model to predict employee stress level. Your dataset contains a lot of predictive features but doesn&#39;t contain a label named <em>stress level.</em> Undaunted, you pick &quot;workplace accidents&quot; as a proxy label for stress level. After all, employees under high stress get into more accidents than calm employees. Or do they? Maybe workplace accidents actually rise and fall for multiple reasons.</p> <p>As a second example, suppose you want <em>is it raining?</em> to be a Boolean label for your dataset, but your dataset doesn&#39;t contain rain data. If photographs are available, you might establish pictures of people carrying umbrellas as a proxy label for <em>is it raining?</em> Is that a good proxy label? Possibly, but people in some cultures may be more likely to carry umbrellas to protect against sun than the rain.</p> <p>Proxy labels are often imperfect. When possible, choose actual labels over proxy labels. That said, when an actual label is absent, pick the proxy label very carefully, choosing the least horrible proxy label candidate.</p> <p><a class="glossary-anchor" name="proxy_sensitive_attributes"></a> <h2 class="hide-from-toc" id="proxy-sensitive-attributes" data-text=" proxy (sensitive attributes)" tabindex="-1"> proxy (sensitive attributes)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div> An attribute used as a stand-in for a <a href="#sensitive_attribute"><strong>sensitive attribute</strong></a>. For example, an individual&#39;s postal code might be used as a proxy for their income, race, or ethnicity.</p> <p><a class="glossary-anchor" name="pure_function"></a> <h2 class="hide-from-toc" id="pure-function" data-text=" pure function" tabindex="-1"> pure function</h2></p> <p>A function whose outputs are based only on its inputs, and that has no side effects. 
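</p> <p>A minimal sketch contrasting a pure function with an impure one (the examples are illustrative):</p> <devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr"><code translate="no" dir="ltr">running_total = 0  # Global state.

def impure_add(x):
    """Impure: reads and mutates a variable outside the function."""
    global running_total
    running_total += x
    return running_total

def pure_add(total, x):
    """Pure: the output depends only on the inputs, and nothing else changes."""
    return total + x
</code></pre></devsite-code> <p>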
Specifically, a pure function doesn&#39;t use or change any global state, such as the contents of a file or the value of a variable outside the function.</p> <p>Pure functions can be used to create thread-safe code, which is beneficial when sharding <a href="#model"><strong>model</strong></a> code across multiple <a href="#accelerator-chip"><strong>accelerator chips</strong></a>.</p> <p><a href="#JAX"><strong>JAX&#39;s</strong></a> function transformation methods require that the input functions are pure functions.</p> <p><a class="glossary-anchor" name="q"></a> <h2 class="glossary" id="q" data-text="Q" tabindex="-1">Q</h2></p> <p><a class="glossary-anchor" name="q-function"></a> <h2 class="hide-from-toc" id="q-function" data-text=" Q-function" tabindex="-1"> Q-function</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>In <a href="#reinforcement_learning"><strong>reinforcement learning</strong></a>, the function that predicts the expected <a href="#return"><strong>return</strong></a> from taking an <a href="#action"><strong>action</strong></a> in a <a href="#state"><strong>state</strong></a> and then following a given <a href="#policy"><strong>policy</strong></a>.</p> <p>Q-function is also known as <strong>state-action value function</strong>.</p> <p><a class="glossary-anchor" name="q-learning"></a> <h2 class="hide-from-toc" id="q-learning" data-text=" Q-learning" tabindex="-1"> Q-learning</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>In <a href="#reinforcement_learning"><strong>reinforcement learning</strong></a>, an algorithm that allows an <a href="#agent"><strong>agent</strong></a> to learn the optimal <a href="#q-function"><strong>Q-function</strong></a> of a <a href="#markov_decision_process"><strong>Markov decision process</strong></a> by applying the <a href="#bellman_equation"><strong>Bellman equation</strong></a>. The Markov decision process models an <a href="#environment"><strong>environment</strong></a>.</p> <p><a class="glossary-anchor" name="quantile"></a> <h2 class="hide-from-toc" id="quantile" data-text=" quantile" tabindex="-1"> quantile</h2></p> <p>Each bucket in <a href="#quantile_bucketing"><strong>quantile bucketing</strong></a>.</p> <p><a class="glossary-anchor" name="quantile_bucketing"></a> <h2 class="hide-from-toc" id="quantile-bucketing" data-text=" quantile bucketing" tabindex="-1"> quantile bucketing</h2></p> <p>Distributing a feature&#39;s values into <a href="#bucketing"><strong>buckets</strong></a> so that each bucket contains the same (or almost the same) number of examples. For example, the following figure divides 44 points into 4 buckets, each of which contains 11 points. In order for each bucket in the figure to contain the same number of points, some buckets span a different width of x-values.</p> <p> <img src="/static/machine-learning/glossary/images/QuantileBucketing.svg" loading="lazy" width="500" alt="44 data points divided into 4 buckets of 11 points each. Although each bucket contains the same number of data points, some buckets contain a wider range of feature values than other buckets." 
> </p> <p><a class="glossary-anchor" name="quantization"></a> <h2 class="hide-from-toc" id="quantization" data-text=" quantization" tabindex="-1"> quantization</h2></p> <p>Overloaded term that could be used in any of the following ways:</p> <ul> <li>Implementing <a href="#quantile_bucketing"><strong>quantile bucketing</strong></a> on a particular <a href="#feature"><strong>feature</strong></a>.</li> <li>Transforming data into zeroes and ones for quicker storing, training, and inferring. As Boolean data is more robust to noise and errors than other formats, quantization can improve model correctness. Quantization techniques include rounding, truncating, and <a href="#binning"><strong>binning</strong></a>.</li> <li><p>Reducing the number of bits used to store a model&#39;s <a href="#parameter"><strong>parameters</strong></a>. For example, suppose a model&#39;s parameters are stored as 32-bit floating-point numbers. Quantization converts those parameters from 32 bits down to 4, 8, or 16 bits. Quantization reduces the following:</p> <ul> <li>Compute, memory, disk, and network usage</li> <li>Time to infer a prediction</li> <li>Power consumption</li> </ul> <p>However, quantization sometimes decreases the correctness of a model&#39;s predictions.</p> </li> </ul> <p><a class="glossary-anchor" name="queue"></a> <h2 class="hide-from-toc" id="queue" data-text=" queue" tabindex="-1"> queue</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>A TensorFlow <a href="#Operation"><strong>Operation</strong></a> that implements a queue data structure. Typically used in I/O.</p> <p><a class="glossary-anchor" name="r"></a> <h2 class="glossary" id="r" data-text="R" tabindex="-1">R</h2></p> <p><a class="glossary-anchor" name="RAG"></a> <h2 class="hide-from-toc" id="rag" data-text=" RAG" tabindex="-1"> RAG</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Abbreviation for <a href="#retrieval-augmented_generation"><strong>retrieval-augmented generation</strong></a>.</p> <p><a class="glossary-anchor" name="random-forest"></a> <h2 class="hide-from-toc" id="random-forest" data-text=" random forest" tabindex="-1"> random forest</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>An <a href="#ensemble"><strong>ensemble</strong></a> of <a href="#decision-tree"><strong>decision trees</strong></a> in which each decision tree is trained with a specific random noise, such as <a href="#bagging"><strong>bagging</strong></a>.</p> <p>Random forests are a type of <a href="#decision-forest"><strong>decision forest</strong></a>.</p> <p><a class="glossary-anchor" name="random_policy"></a> <h2 class="hide-from-toc" id="random-policy" data-text=" random policy" tabindex="-1"> random policy</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>In <a href="#reinforcement_learning"><strong>reinforcement learning</strong></a>, a <a href="#policy"><strong>policy</strong></a> that chooses an <a href="#action"><strong>action</strong></a> at random.</p> <p><a class="glossary-anchor" name="ranking"></a> <h2 class="hide-from-toc" id="ranking" data-text=" ranking" tabindex="-1"> ranking</h2></p> <p>A type of <a href="#supervised_machine_learning"><strong>supervised learning</strong></a> whose objective is to order a list of items.</p> <p><a class="glossary-anchor"
name="rank_ordinality"></a> <h2 class="hide-from-toc" id="rank-ordinality" data-text=" rank (ordinality)" tabindex="-1"> rank (ordinality)</h2></p> <p>The ordinal position of a class in a machine learning problem that categorizes classes from highest to lowest. For example, a behavior ranking system could rank a dog&#39;s rewards from highest (a steak) to lowest (wilted kale).</p> <p><a class="glossary-anchor" name="rank"></a> <a class="glossary-anchor" name="rank_Tensor"></a> <h2 class="hide-from-toc" id="rank-tensor" data-text=" rank (Tensor)" tabindex="-1"> rank (Tensor)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>The number of dimensions in a <a href="#tensor"><strong>Tensor</strong></a>. For example, a scalar has rank 0, a vector has rank 1, and a matrix has rank 2.</p> <p>Not to be confused with <a href="#rank_ordinality"><strong>rank (ordinality)</strong></a>.</p> <p><a class="glossary-anchor" name="rater"></a> <h2 class="hide-from-toc" id="rater" data-text=" rater" tabindex="-1"> rater</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A human who provides <a href="#label"><strong>labels</strong></a> for <a href="#example"><strong>examples</strong></a>. &quot;Annotator&quot; is another name for rater.</p> <p><a class="glossary-anchor" name="recall"></a> <h2 class="hide-from-toc" id="recall" data-text=" recall" tabindex="-1"> recall</h2></p> <p>A metric for <a href="#classification_model"><strong>classification models</strong></a> that answers the following question:</p> <blockquote> <p>When <a href="#ground_truth"><strong>ground truth</strong></a> was the <a href="#positive_class"><strong>positive class</strong></a>, what percentage of predictions did the model correctly identify as the positive class?</p> </blockquote> <p>Here is the formula:</p> <p>\[\text{Recall} = \frac{\text{true positives}} {\text{true positives} + \text{false negatives}} \]</p> <p>where:</p> <ul> <li>true positive means the model <em>correctly</em> predicted the positive class.</li> <li>false negative means that the model <em>mistakenly</em> predicted the <a href="#negative_class"><strong>negative class</strong></a>.</li> </ul> <p>For instance, suppose your model made 200 predictions on examples for which ground truth was the positive class. Of these 200 predictions:</p> <ul> <li>180 were true positives.</li> <li>20 were false negatives.</li> </ul> <p>In this case:</p> <p>\[\text{Recall} = \frac{\text{180}} {\text{180} + \text{20}} = 0.9 \]</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-for-notes-about-class-imbalanced-datasets." data-text=" Click the icon for notes about class-imbalanced datasets. " tabindex="-1"> Click the icon for notes about class-imbalanced datasets. </h4> <div class="expand-background"> <p> Recall is particularly useful for determining the predictive power of classification models in which the positive class is rare. For example, consider a <a href="#class_imbalanced_data_set"><b>class-imbalanced dataset</b></a> in which the positive class for a certain disease occurs in only 10 patients out of a million. 
Suppose your model makes five million predictions that yield the following outcomes: </p> <ul> <li>30 True Positives</li> <li>20 False Negatives</li> <li>4,999,000 True Negatives</li> <li>950 False Positives</li> </ul> <p>The recall of this model is therefore:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded> recall = TP / (TP + FN) recall = 30 / (30 + 20) = 0.6 = 60% </pre></devsite-code> By contrast, the <a href="#accuracy">accuracy</a> of this model is: <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded> accuracy = (TP + TN) / (TP + TN + FP + FN) accuracy = (30 + 4,999,000) / (30 + 4,999,000 + 950 + 20) = 99.98% </pre></devsite-code> <p> That high value of accuracy looks impressive but is essentially meaningless. Recall is a much more useful metric for class-imbalanced datasets than accuracy. </p> </div> <hr /> </section> <p>See <a href="/machine-learning/crash-course/classification/accuracy-precision-recall">Classification: Accuracy, recall, precision and related metrics</a> for more information.</p> <p><a class="glossary-anchor" name="recommendation_system"></a> <h2 class="hide-from-toc" id="recommendation-system" data-text=" recommendation system" tabindex="-1"> recommendation system</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Recommendation Systems">#recsystems</div> </div></p> <p>A system that selects for each user a relatively small set of desirable <a href="#items"><strong>items</strong></a> from a large corpus. For example, a video recommendation system might recommend two videos from a corpus of 100,000 videos, selecting <em>Casablanca</em> and <em>The Philadelphia Story</em> for one user, and <em>Wonder Woman</em> and <em>Black Panther</em> for another. A video recommendation system might base its recommendations on factors such as:</p> <ul> <li>Movies that similar users have rated or watched.</li> <li>Genre, directors, actors, target demographic...</li> </ul> <p><a class="glossary-anchor" name="Rectified-Linear-Unit"></a> <a class="glossary-anchor" name="ReLU"></a> <h2 class="hide-from-toc" id="rectified-linear-unit-relu" data-text=" Rectified Linear Unit (ReLU)" tabindex="-1"> Rectified Linear Unit (ReLU)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>An <a href="#activation_function"><strong>activation function</strong></a> with the following behavior:</p> <ul> <li>If input is negative or zero, then the output is 0.</li> <li>If input is positive, then the output is equal to the input.</li> </ul> <p>For example:</p> <ul> <li>If the input is -3, then the output is 0.</li> <li>If the input is +3, then the output is 3.0.</li> </ul> <p>Here is a plot of ReLU:</p> <p> <img src="/static/machine-learning/glossary/images/ReLU.png" loading="lazy" alt="A cartesian plot of two lines. The first line has a constant y value of 0, running along the x-axis from -infinity,0 to 0,-0. The second line starts at 0,0. This line has a slope of +1, so it runs from 0,0 to +infinity,+infinity." > </p> <p>ReLU is a very popular activation function. 
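</p> <p>For instance, here is a minimal NumPy sketch of ReLU (illustrative only):</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded>import numpy as np

def relu(x):
    """Returns 0 for negative inputs and the input itself for positive inputs."""
    return np.maximum(0, x)

relu(np.array([-3.0, 0.0, 3.0]))  # array([0., 0., 3.])</pre></devsite-code> <p>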
Despite its simple behavior, ReLU still enables a neural network to learn <a href="#nonlinear"><strong>nonlinear</strong></a> relationships between <a href="#feature"><strong>features</strong></a> and the <a href="#label"><strong>label</strong></a>.</p> <p><a class="glossary-anchor" name="recurrent_neural_network"></a> <h2 class="hide-from-toc" id="recurrent-neural-network" data-text=" recurrent neural network " tabindex="-1"> recurrent neural network </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Sequence Models">#seq</div> </div></p> <p>A <a href="#neural_network"><strong>neural network</strong></a> that is intentionally run multiple times, where parts of each run feed into the next run. Specifically, hidden layers from the previous run provide part of the input to the same hidden layer in the next run. Recurrent neural networks are particularly useful for evaluating sequences, so that the hidden layers can learn from previous runs of the neural network on earlier parts of the sequence.</p> <p>For example, the following figure shows a recurrent neural network that runs four times. Notice that the values learned in the hidden layers from the first run become part of the input to the same hidden layers in the second run. Similarly, the values learned in the hidden layer on the second run become part of the input to the same hidden layer in the third run. In this way, the recurrent neural network gradually trains and predicts the meaning of the entire sequence rather than just the meaning of individual words.</p> <p> <img src="/static/machine-learning/glossary/images/RNN.svg" width="545" loading="lazy" alt="An RNN that runs four times to process four input words." > </p> <p><a class="glossary-anchor" name="regression_model"></a> <h2 class="hide-from-toc" id="regression-model" data-text=" regression model" tabindex="-1"> regression model</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Informally, a model that generates a numerical prediction. (In contrast, a <a href="#classification_model"><strong>classification model</strong></a> generates a class prediction.) For example, the following are all regression models:</p> <ul> <li>A model that predicts a certain house&#39;s value, such as 423,000 Euros.</li> <li>A model that predicts a certain tree&#39;s life expectancy, such as 23.2 years.</li> <li>A model that predicts the amount of rain that will fall in a certain city over the next six hours, such as 0.18 inches.</li> </ul> <p>Two common types of regression models are:</p> <ul> <li><a href="#linear_regression"><strong>Linear regression</strong></a>, which finds the line that best fits label values to features.</li> <li><a href="#logistic_regression"><strong>Logistic regression</strong></a>, which generates a probability between 0.0 and 1.0 that a system typically then maps to a class prediction.</li> </ul> <p>Not every model that outputs numerical predictions is a regression model. In some cases, a numeric prediction is really just a classification model that happens to have numeric class names. 
For example, a model that predicts a numeric postal code is a classification model, not a regression model.</p> <p><a class="glossary-anchor" name="regularization"></a> <h2 class="hide-from-toc" id="regularization" data-text=" regularization" tabindex="-1"> regularization</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Any mechanism that reduces <a href="#overfitting"><strong>overfitting</strong></a>. Popular types of regularization include:</p> <ul> <li><a href="#L1_regularization"><strong>L<sub>1</sub> regularization</strong></a></li> <li><a href="#L2_regularization"><strong>L<sub>2</sub> regularization</strong></a></li> <li><a href="#dropout_regularization"><strong>dropout regularization</strong></a></li> <li><a href="#early_stopping"><strong>early stopping</strong></a> (this is not a formal regularization method, but can effectively limit overfitting)</li> </ul> <p>Regularization can also be defined as the penalty on a model&#39;s complexity.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-for-additional-notes._13" data-text=" Click the icon for additional notes. " tabindex="-1"> Click the icon for additional notes. </h4> <div class="expand-background"> <p> Regularization is counterintuitive. Increasing regularization usually <i>increases</i> training loss, which is confusing because, well, isn't the goal to <i>minimize</i> training loss? </p> <p> Actually, no. The goal isn't to minimize training loss. The goal is to make excellent predictions on real-world examples. Remarkably, even though increasing regularization increases training loss, it usually helps models make better predictions on real-world examples. </p> </div> <hr /> </section> <p><a class="glossary-anchor" name="regularization_rate"></a> <h2 class="hide-from-toc" id="regularization-rate" data-text=" regularization rate" tabindex="-1"> regularization rate</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A number that specifies the relative importance of <a href="#regularization"><strong>regularization</strong></a> during training. Raising the regularization rate reduces <a href="#overfitting"><strong>overfitting</strong></a> but may reduce the model&#39;s predictive power. Conversely, reducing or omitting the regularization rate increases overfitting.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-to-see-the-math._3" data-text=" Click the icon to see the math. " tabindex="-1"> Click the icon to see the math. </h4> <div class="expand-background"> <p> The regularization rate is usually represented as the Greek letter lambda. 
The following simplified <a href="#loss"><b>loss</b></a> equation shows lambda's influence: </p> <div> $$\text{minimize(loss function + }\lambda\text{(regularization))}$$ </div> <p>where <i>regularization</i> is any regularization mechanism, including:</p> <ul> <li><a href="#L1_regularization"><b>L<sub>1</sub> regularization</b></a></li> <li><a href="#L2_regularization"><b>L<sub>2</sub> regularization</b></a></li> </ul> </div> <hr /> </section> <p><a class="glossary-anchor" name="reinforcement_learning"></a> <h2 class="hide-from-toc" id="reinforcement-learning-rl" data-text=" reinforcement learning (RL)" tabindex="-1"> reinforcement learning (RL)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>A family of algorithms that learn an optimal <a href="#policy"><strong>policy</strong></a>, whose goal is to maximize <a href="#return"><strong>return</strong></a> when interacting with an <a href="#environment"><strong>environment</strong></a>. For example, the ultimate reward of most games is victory. Reinforcement learning systems can become expert at playing complex games by evaluating sequences of previous game moves that ultimately led to wins and sequences that ultimately led to losses.</p> <p><a class="glossary-anchor" name="RLHF"></a> <h2 class="hide-from-toc" id="reinforcement-learning-from-human-feedback-rlhf" data-text=" Reinforcement Learning from Human Feedback (RLHF)" tabindex="-1"> Reinforcement Learning from Human Feedback (RLHF)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Generative AI">#generativeAI</div> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>Using feedback from human raters to improve the quality of a model&#39;s responses. For example, an RLHF mechanism can ask users to rate the quality of a model&#39;s response with a 👍 or 👎 emoji. The system can then adjust its future responses based on that feedback.</p> <p><a class="glossary-anchor" name="ReLU"></a> <h2 class="hide-from-toc" id="relu" data-text=" ReLU" tabindex="-1"> ReLU</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Abbreviation for <a href="#ReLU"><strong>Rectified Linear Unit</strong></a>.</p> <p><a class="glossary-anchor" name="replay_buffer"></a> <h2 class="hide-from-toc" id="replay-buffer" data-text=" replay buffer" tabindex="-1"> replay buffer</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>In <a href="#deep_q-network"><strong>DQN</strong></a>-like algorithms, the memory used by the agent to store state transitions for use in <a href="#experience_replay"><strong>experience replay</strong></a>.</p> <p><a class="glossary-anchor" name="replica"></a> <h2 class="hide-from-toc" id="replica" data-text=" replica " tabindex="-1"> replica </h2></p> <p>A copy of the <a href="#training_set"><strong>training set</strong></a> or <a href="#model"><strong>model</strong></a>, typically on another machine.
For example, a system could use the following strategy for implementing <a href="#data-parallelism"><strong>data parallelism</strong></a>:</p> <ol> <li>Place replicas of an existing model on multiple machines.</li> <li>Send different subsets of the training set to each replica.</li> <li>Aggregate the <a href="#parameter"><strong>parameter</strong></a> updates.</li> </ol> <p><a class="glossary-anchor" name="reporting_bias"></a> <h2 class="hide-from-toc" id="reporting-bias" data-text=" reporting bias " tabindex="-1"> reporting bias </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>The fact that the frequency with which people write about actions, outcomes, or properties is not a reflection of their real-world frequencies or the degree to which a property is characteristic of a class of individuals. Reporting bias can influence the composition of data that machine learning systems learn from.</p> <p>For example, in books, the word <em>laughed</em> is more prevalent than <em>breathed</em>. A machine learning model that estimates the relative frequency of laughing and breathing from a book corpus would probably determine that laughing is more common than breathing.</p> <p><a class="glossary-anchor" name="representation"></a> <h2 class="hide-from-toc" id="representation" data-text=" representation" tabindex="-1"> representation</h2></p> <p>The process of mapping data to useful <a href="#feature"><strong>features</strong></a>.</p> <p><a class="glossary-anchor" name="re-ranking"></a> <h2 class="hide-from-toc" id="re-ranking" data-text=" re-ranking" tabindex="-1"> re-ranking</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Recommendation Systems">#recsystems</div> </div></p> <p>The final stage of a <a href="#recommendation_system"><strong>recommendation system</strong></a>, during which scored items may be re-graded according to some other (typically, non-ML) algorithm. Re-ranking evaluates the list of items generated by the <a href="#scoring"><strong>scoring</strong></a> phase, taking actions such as:</p> <ul> <li>Eliminating items that the user has already purchased.</li> <li>Boosting the score of fresher items.</li> </ul> <p><a class="glossary-anchor" name="retrieval-augmented_generation"></a> <h2 class="hide-from-toc" id="retrieval-augmented-generation-rag" data-text=" retrieval-augmented generation (RAG)" tabindex="-1"> retrieval-augmented generation (RAG)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A technique for improving the quality of <a href="#large-language-model"><strong>large language model (LLM)</strong></a> output by grounding it with sources of knowledge retrieved after the model was trained. RAG improves the accuracy of LLM responses by providing the trained LLM with access to information retrieved from trusted knowledge bases or documents.</p> <p>Common motivations to use retrieval-augmented generation include:</p> <ul> <li>Increasing the factual accuracy of a model&#39;s generated responses.</li> <li>Giving the model access to knowledge it was not trained on.</li> <li>Changing the knowledge that the model uses.</li> <li>Enabling the model to cite sources.</li> </ul> <p>For example, suppose that a chemistry app uses the <a href="https://developers.generativeai.google/products/palm">PaLM API</a> to generate summaries related to user queries. 
When the app&#39;s backend receives a query, the backend:</p> <ol> <li>Searches for (&quot;retrieves&quot;) data that&#39;s relevant to the user&#39;s query.</li> <li>Appends (&quot;augments&quot;) the relevant chemistry data to the user&#39;s query.</li> <li>Instructs the LLM to create a summary based on the appended data.</li> </ol> <p><a class="glossary-anchor" name="return"></a> <h2 class="hide-from-toc" id="return" data-text=" return" tabindex="-1"> return</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>In reinforcement learning, given a certain policy and a certain state, the return is the sum of all <a href="#reward"><strong>rewards</strong></a> that the <a href="#agent"><strong>agent</strong></a> expects to receive when following the <a href="#policy"><strong>policy</strong></a> from the <a href="#state"><strong>state</strong></a> to the end of the <a href="#episode"><strong>episode</strong></a>. The agent accounts for the delayed nature of expected rewards by discounting rewards according to the state transitions required to obtain the reward.</p> <p>Therefore, if the discount factor is \(\gamma\), and \(r_0, \ldots, r_{N-1}\) denote the rewards until the end of the episode, then the return calculation is as follows:</p> <div> $$\text{Return} = r_0 + \gamma r_1 + \gamma^2 r_2 + \ldots + \gamma^{N-1} r_{N-1}$$ </div> <p><a class="glossary-anchor" name="reward"></a> <h2 class="hide-from-toc" id="reward" data-text=" reward" tabindex="-1"> reward</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>In reinforcement learning, the numerical result of taking an <a href="#action"><strong>action</strong></a> in a <a href="#state"><strong>state</strong></a>, as defined by the <a href="#environment"><strong>environment</strong></a>.</p> <p><a class="glossary-anchor" name="ridge_regularization"></a> <h2 class="hide-from-toc" id="ridge-regularization" data-text=" ridge regularization" tabindex="-1"> ridge regularization</h2></p> <p>Synonym for <a href="#L2_regularization"><strong>L<sub>2</sub> regularization</strong></a>. The term <strong>ridge regularization</strong> is more frequently used in pure statistics contexts, whereas <strong>L<sub>2</sub> regularization</strong> is used more often in machine learning.</p> <p><a class="glossary-anchor" name="RNN"></a> <h2 class="hide-from-toc" id="rnn" data-text=" RNN" tabindex="-1"> RNN</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Sequence Models">#seq</div> </div></p> <p>Abbreviation for <a href="#recurrent_neural_network"><strong>recurrent neural networks</strong></a>.</p> <p><a class="glossary-anchor" name="ROC"></a> <h2 class="hide-from-toc" id="roc-receiver-operating-characteristic-curve" data-text=" ROC (receiver operating characteristic) Curve" tabindex="-1"> ROC (receiver operating characteristic) Curve</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A graph of <a href="#TP_rate"><strong>true positive rate</strong></a> versus <a href="#FP_rate"><strong>false positive rate</strong></a> for different <a href="#classification_threshold"><strong>classification thresholds</strong></a> in binary classification.</p> <p>The shape of an ROC curve suggests a binary classification model&#39;s ability to separate positive classes from negative classes.
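</p> <p>Assuming scikit-learn is available, the points of an ROC curve can be computed from labels and predicted scores with a sketch like the following (the labels and scores are made up):</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded>import numpy as np
from sklearn.metrics import roc_curve

# Hypothetical ground truth (1 = positive class) and model scores.
y_true = np.array([0, 0, 1, 1, 0, 1, 1, 0])
y_score = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.9, 0.65, 0.3])

# One (false positive rate, true positive rate) point per threshold.
fpr, tpr, thresholds = roc_curve(y_true, y_score)</pre></devsite-code> <p>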
Suppose, for example, that a binary classification model perfectly separates all the negative classes from all the positive classes:</p> <p> <img src="/static/machine-learning/glossary/images/ROCSetupIdealDistributionNoClassificationThreshold.png" loading="lazy" alt="A number line with 8 positive examples on the right side and 7 negative examples on the left." > </p> <p>The ROC curve for the preceding model looks as follows:</p> <p> <img src="/static/machine-learning/glossary/images/ROCcurvePerfect.png" loading="lazy" alt="An ROC curve. The x-axis is False Positive Rate and the y-axis is True Positive Rate. The curve has an inverted L shape. The curve starts at (0.0,0.0) and goes straight up to (0.0,1.0). Then the curve goes from (0.0,1.0) to (1.0,1.0)." > </p> <p>In contrast, the following illustration graphs the raw logistic regression values for a terrible model that can&#39;t separate negative classes from positive classes at all:</p> <p> <img src="/static/machine-learning/glossary/images/ROCWorstCaseDistribution.png" loading="lazy" alt="A number line with positive examples and negative classes completely intermixed." > </p> <p>The ROC curve for this model looks as follows:</p> <p> <img src="/static/machine-learning/glossary/images/ROCcurveWorstCase.png" loading="lazy" alt="An ROC curve, which is actually a straight line from (0.0,0.0) to (1.0,1.0)." > </p> <p>Meanwhile, back in the real world, most binary classification models separate positive and negative classes to some degree, but usually not perfectly. So, a typical ROC curve falls somewhere between the two extremes:</p> <p> <img src="/static/machine-learning/glossary/images/ROCTypicalGraph.png" loading="lazy" alt="An ROC curve. The x-axis is False Positive Rate and the y-axis is True Positive Rate. The ROC curve approximates a shaky arc traversing the compass points from West to North." > </p> <p>The point on an ROC curve closest to (0.0,1.0) theoretically identifies the ideal classification threshold. However, several other real-world issues influence the selection of the ideal classification threshold. For example, perhaps false negatives cause far more pain than false positives.</p> <p>A numerical metric called <a href="#AUC"><strong>AUC</strong></a> summarizes the ROC curve into a single floating-point value.</p> <p><a class="glossary-anchor" name="role-prompting"></a> <h2 class="hide-from-toc" id="role-prompting" data-text=" role prompting " tabindex="-1"> role prompting </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>An optional part of a <a href="#prompt"><strong>prompt</strong></a> that identifies a target audience for a <a href="#generative-AI"><strong>generative AI</strong></a> model&#39;s response. <em>Without</em> a role prompt, a large language model provides an answer that may or may not be useful for the person asking the questions. <em>With</em> a role prompt, a large language model can answer in a way that&#39;s more appropriate and more helpful for a specific target audience. For example, the role prompt portions of the following prompts are in boldface:</p> <ul> <li>Summarize this article <strong>for a PhD in economics</strong>.</li> <li>Describe how tides work <strong>for a ten-year-old</strong>.</li> <li>Explain the 2008 financial crisis.
<strong>Speak as you might to a young child, or a golden retriever.</strong></li> </ul> <p><a class="glossary-anchor" name="root"></a> <h2 class="hide-from-toc" id="root" data-text=" root " tabindex="-1"> root </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>The starting <a href="#node-decision-tree"><strong>node</strong></a> (the first <a href="#condition"><strong>condition</strong></a>) in a <a href="#decision-tree"><strong>decision tree</strong></a>. By convention, diagrams put the root at the top of the decision tree. For example:</p> <p> <img src="/static/machine-learning/glossary/images/root.png" loading="lazy" width="344" alt="A decision tree with two conditions and three leaves. The starting condition (x > 2) is the root." > </p> <p><a class="glossary-anchor" name="root_directory"></a> <h2 class="hide-from-toc" id="root-directory" data-text=" root directory" tabindex="-1"> root directory</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>The directory you specify for hosting subdirectories of the TensorFlow checkpoint and events files of multiple models.</p> <p><a class="glossary-anchor" name="RMSE"></a> <h2 class="hide-from-toc" id="root-mean-squared-error-rmse" data-text=" Root Mean Squared Error (RMSE)" tabindex="-1"> Root Mean Squared Error (RMSE)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The square root of the <a href="#MSE"><strong>Mean Squared Error</strong></a>.</p> <p><a class="glossary-anchor" name="rotational_invariance"></a> <h2 class="hide-from-toc" id="rotational-invariance" data-text=" rotational invariance" tabindex="-1"> rotational invariance</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>In an image classification problem, an algorithm&#39;s ability to successfully classify images even when the orientation of the image changes. For example, the algorithm can still identify a tennis racket whether it is pointing up, sideways, or down. Note that rotational invariance is not always desirable; for example, an upside-down 9 shouldn&#39;t be classified as a 9.</p> <p>See also <a href="#translational_invariance"><strong>translational invariance</strong></a> and <a href="#size_invariance"><strong>size invariance</strong></a>.</p> <p><a class="glossary-anchor" name="R-squared"></a> <h2 class="hide-from-toc" id="r-squared" data-text=" R-squared" tabindex="-1"> R-squared</h2></p> <p>A <a href="#regression_model"><strong>regression</strong></a> metric indicating how much variation in a <a href="#label"><strong>label</strong></a> is due to an individual feature or to a feature set. R-squared is a value between 0 and 1, which you can interpret as follows:</p> <ul> <li>An R-squared of 0 means that none of a label&#39;s variation is due to the feature set.</li> <li>An R-squared of 1 means that all of a label&#39;s variation is due to the feature set.</li> <li>An R-squared between 0 and 1 indicates the extent to which the label&#39;s variation can be predicted from a particular feature or the feature set. 
For example, an R-squared of 0.10 means that 10 percent of the variance in the label is due to the feature set, an R-squared of 0.20 means that 20 percent is due to the feature set, and so on.</li> </ul> <p>R-squared is the square of the <a href="https://wikipedia.org/wiki/Correlation_coefficient">Pearson correlation coefficient</a> between the values that a model predicted and <a href="#ground_truth"><strong>ground truth</strong></a>.</p> <p><a class="glossary-anchor" name="s"></a> <h2 class="glossary" id="s" data-text="S" tabindex="-1">S</h2></p> <p><a class="glossary-anchor" name="sampling_bias"></a> <h2 class="hide-from-toc" id="sampling-bias" data-text=" sampling bias " tabindex="-1"> sampling bias </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>See <a href="#selection_bias"><strong>selection bias</strong></a>.</p> <p><a class="glossary-anchor" name="sampling-with-replacement"></a> <h2 class="hide-from-toc" id="sampling-with-replacement" data-text=" sampling with replacement " tabindex="-1"> sampling with replacement </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>A method of picking items from a set of candidate items in which the same item can be picked multiple times. The phrase &quot;with replacement&quot; means that after each selection, the selected item is returned to the pool of candidate items. The inverse method, <strong>sampling without replacement</strong>, means that a candidate item can only be picked once.</p> <p>For example, consider the following fruit set:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Text only">fruit = {kiwi, apple, pear, fig, cherry, lime, mango}</pre></devsite-code> <p>Suppose that the system randomly picks <code translate="no" dir="ltr">fig</code> as the first item. If using sampling with replacement, then the system picks the second item from the following set:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Text only">fruit = {kiwi, apple, pear, fig, cherry, lime, mango}</pre></devsite-code> <p>Yes, that&#39;s the same set as before, so the system could potentially pick <code translate="no" dir="ltr">fig</code> again.</p> <p>If using sampling without replacement, once picked, a sample can&#39;t be picked again. For example, if the system randomly picks <code translate="no" dir="ltr">fig</code> as the first sample, then <code translate="no" dir="ltr">fig</code> can&#39;t be picked again. Therefore, the system picks the second sample from the following (reduced) set:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Text only">fruit = {kiwi, apple, pear, cherry, lime, mango}</pre></devsite-code> <section class="expandable"> <h4 class="showalways" id="click-the-icon-for-additional-notes._14" data-text=" Click the icon for additional notes. " tabindex="-1"> Click the icon for additional notes. </h4> <div class="expand-background"> <p>The word <i>replacement</i> in <b>sampling with replacement</b> confuses many people. In English, <i>replacement</i> means "substitution." 
However, <b>sampling with replacement</b> actually uses the French definition for <i>replacement</i>, which means "putting something back."</p> <p>The English word <i>replacement</i> is translated as the French word <i>remplacement</i>.</p> </div> <hr /> </section> <p><a class="glossary-anchor" name="SavedModel"></a> <h2 class="hide-from-toc" id="savedmodel" data-text=" SavedModel" tabindex="-1"> SavedModel</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>The recommended format for saving and recovering TensorFlow models. SavedModel is a language-neutral, recoverable serialization format, which enables higher-level systems and tools to produce, consume, and transform TensorFlow models.</p> <p>See the <a href="https://www.tensorflow.org/guide/saved_model" target="T">Saving and Restoring chapter</a> in the TensorFlow Programmer&#39;s Guide for complete details.</p> <p><a class="glossary-anchor" name="Saver"></a> <h2 class="hide-from-toc" id="saver" data-text=" Saver" tabindex="-1"> Saver</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>A <a href="https://www.tensorflow.org/api_docs/python/tf/compat/v1/train/Saver" target="T">TensorFlow object</a> responsible for saving model checkpoints.</p> <p><a class="glossary-anchor" name="scalar"></a> <h2 class="hide-from-toc" id="scalar" data-text=" scalar" tabindex="-1"> scalar</h2></p> <p>A single number or a single string that can be represented as a <a href="#tensor"><strong>tensor</strong></a> of <a href="#rank"><strong>rank</strong></a> 0. For example, the following lines of code each create one scalar in TensorFlow:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Text only">breed = tf.Variable("poodle", tf.string) temperature = tf.Variable(27, tf.int16) precision = tf.Variable(0.982375101275, tf.float64)</pre></devsite-code> <p><a class="glossary-anchor" name="scaling"></a> <h2 class="hide-from-toc" id="scaling" data-text=" scaling" tabindex="-1"> scaling</h2></p> <p>Any mathematical transform or technique that shifts the range of a label and/or feature value. Some forms of scaling are very useful for transformations like <a href="#normalization"><strong>normalization</strong></a>.</p> <p>Common forms of scaling useful in Machine Learning include:</p> <ul> <li>linear scaling, which typically uses a combination of subtraction and division to replace the original value with a number between -1 and +1 or between 0 and 1.</li> <li>logarithmic scaling, which replaces the original value with its logarithm.</li> <li><a href="#Z-score-normalization"><strong>Z-score normalization</strong></a>, which replaces the original value with a floating-point value representing the number of standard deviations from that feature&#39;s mean.</li> </ul> <p><a class="glossary-anchor" name="scikit-learn"></a> <h2 class="hide-from-toc" id="scikit-learn" data-text=" scikit-learn" tabindex="-1"> scikit-learn</h2></p> <p>A popular open-source machine learning platform. 
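</p> <p>For example, a minimal sketch of training and using a scikit-learn classifier (the data is made up):</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded>from sklearn.linear_model import LogisticRegression

# Hypothetical features and binary labels.
features = [[1.0, 2.0], [2.0, 0.5], [3.0, 3.5], [4.0, 1.0]]
labels = [0, 0, 1, 1]

model = LogisticRegression()
model.fit(features, labels)
model.predict([[2.5, 2.0]])</pre></devsite-code> <p>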
See <a href="http://scikit-learn.org/" target="T">scikit-learn.org</a>.</p> <p><a class="glossary-anchor" name="scoring"></a> <h2 class="hide-from-toc" id="scoring" data-text=" scoring" tabindex="-1"> scoring</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Recommendation Systems">#recsystems</div> </div></p> <p>The part of a <a href="#recommendation_system"><strong>recommendation system</strong></a> that provides a value or ranking for each item produced by the <a href="#candidate_generation"><strong>candidate generation</strong></a> phase.</p> <p><a class="glossary-anchor" name="selection_bias"></a> <h2 class="hide-from-toc" id="selection-bias" data-text=" selection bias " tabindex="-1"> selection bias </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>Errors in conclusions drawn from sampled data due to a selection process that generates systematic differences between samples observed in the data and those not observed. The following forms of selection bias exist:</p> <ul> <li><strong>coverage bias</strong>: The population represented in the dataset doesn&#39;t match the population that the machine learning model is making predictions about.</li> <li><strong>sampling bias</strong>: Data is not collected randomly from the target group.</li> <li><strong>non-response bias</strong> (also called <strong>participation bias</strong>): Users from certain groups opt-out of surveys at different rates than users from other groups.</li> </ul> <p>For example, suppose you are creating a machine learning model that predicts people&#39;s enjoyment of a movie. To collect training data, you hand out a survey to everyone in the front row of a theater showing the movie. Offhand, this may sound like a reasonable way to gather a dataset; however, this form of data collection may introduce the following forms of selection bias:</p> <ul> <li>coverage bias: By sampling from a population who chose to see the movie, your model&#39;s predictions may not generalize to people who did not already express that level of interest in the movie.</li> <li>sampling bias: Rather than randomly sampling from the intended population (all the people at the movie), you sampled only the people in the front row. It is possible that the people sitting in the front row were more interested in the movie than those in other rows.</li> <li>non-response bias: In general, people with strong opinions tend to respond to optional surveys more frequently than people with mild opinions. Since the movie survey is optional, the responses are more likely to form a <a href="https://wikipedia.org/wiki/Multimodal_distribution" target="T">bimodal distribution</a> than a normal (bell-shaped) distribution.</li> </ul> <p><a class="glossary-anchor" name="self-attention"></a> <h2 class="hide-from-toc" id="self-attention-also-called-self-attention-layer" data-text=" self-attention (also called self-attention layer)" tabindex="-1"> self-attention (also called self-attention layer)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A neural network layer that transforms a sequence of embeddings (for example, <a href="#token"><strong>token</strong></a> embeddings) into another sequence of embeddings. 
Each embedding in the output sequence is constructed by integrating information from the elements of the input sequence through an <a href="#attention"><strong>attention</strong></a> mechanism.</p> <p>The <strong>self</strong> part of <strong>self-attention</strong> refers to the sequence attending to itself rather than to some other context. Self-attention is one of the main building blocks for <a href="#Transformer"><strong>Transformers</strong></a> and uses dictionary lookup terminology, such as &quot;query&quot;, &quot;key&quot;, and &quot;value&quot;.</p> <p>A self-attention layer starts with a sequence of input representations, one for each word. The input representation for a word can be a simple embedding. For each word in an input sequence, the network scores the relevance of the word to every element in the whole sequence of words. The relevance scores determine how much the word&#39;s final representation incorporates the representations of other words.</p> <p>For example, consider the following sentence:</p> <blockquote> <p>The animal didn&#39;t cross the street because it was too tired.</p> </blockquote> <p>The following illustration (from <a href="https://ai.googleblog.com/2017/08/transformer-novel-neural-network.html">Transformer: A Novel Neural Network Architecture for Language Understanding</a>) shows a self-attention layer&#39;s attention pattern for the pronoun <strong>it</strong>, with the darkness of each line indicating how much each word contributes to the representation:</p> <p> <img src="/static/machine-learning/glossary/images/self-attention.png" loading="lazy" alt="The following sentence appears twice: The animal didn't cross the street because it was too tired. Lines connect the pronoun it in one sentence to five tokens (The, animal, street, it, and the period) in the other sentence. The line between the pronoun it and the word animal is strongest." > </p> <p>The self-attention layer highlights words that are relevant to &quot;it&quot;. 
In this case, the attention layer has learned to highlight words that <strong>it</strong> might refer to, assigning the highest weight to <strong>animal</strong>.</p> <p>For a sequence of <em>n</em> <a href="#token"><strong>tokens</strong></a>, self-attention transforms a sequence of embeddings <em>n</em> separate times, once at each position in the sequence.</p> <p>Refer also to <a href="#attention"><strong>attention</strong></a> and <a href="#multi-head-self-attention"><strong>multi-head self-attention</strong></a>.</p> <p><a class="glossary-anchor" name="self-supervised-learning"></a> <h2 class="hide-from-toc" id="self-supervised-learning" data-text=" self-supervised learning" tabindex="-1"> self-supervised learning</h2></p> <p>A family of techniques for converting an <a href="#unsupervised_machine_learning"><strong>unsupervised machine learning</strong></a> problem into a <a href="#supervised_machine_learning"><strong>supervised machine learning</strong></a> problem by creating surrogate <a href="#label"><strong>labels</strong></a> from <a href="#unlabeled_example"><strong>unlabeled examples</strong></a>.</p> <p>Some <a href="#Transformer"><strong>Transformer</strong></a>-based models such as <a href="#BERT"><strong>BERT</strong></a> use self-supervised learning.</p> <p>Self-supervised training is a <a href="#semi-supervised_learning"><strong>semi-supervised learning</strong></a> approach.</p> <p><a class="glossary-anchor" name="self-training"></a> <h2 class="hide-from-toc" id="self-training" data-text=" self-training" tabindex="-1"> self-training</h2></p> <p>A variant of <a href="#self-supervised-learning"><strong>self-supervised learning</strong></a> that is particularly useful when all of the following conditions are true:</p> <ul> <li>The ratio of <a href="#unlabeled_example"><strong>unlabeled examples</strong></a> to <a href="#labeled_example"><strong>labeled examples</strong></a> in the dataset is high.</li> <li>This is a <a href="#classification_model"><strong>classification</strong></a> problem.</li> </ul> <p>Self-training works by iterating over the following two steps until the model stops improving:</p> <ol> <li>Use <a href="#supervised_machine_learning"><strong>supervised machine learning</strong></a> to train a model on the labeled examples.</li> <li>Use the model created in Step 1 to generate predictions (labels) on the unlabeled examples, moving those in which there is high confidence into the labeled examples with the predicted label.</li> </ol> <p>Notice that each iteration of Step 2 adds more labeled examples for Step 1 to train on.</p> <p><a class="glossary-anchor" name="semi-supervised_learning"></a> <h2 class="hide-from-toc" id="semi-supervised-learning" data-text=" semi-supervised learning" tabindex="-1"> semi-supervised learning</h2></p> <p>Training a model on data where some of the training examples have labels but others don&#39;t. One technique for semi-supervised learning is to infer labels for the unlabeled examples, and then to train on the inferred labels to create a new model. 
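</p> <p>For example, here is a minimal sketch of that label-inference loop, assuming scikit-learn and a hypothetical confidence threshold:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded>import numpy as np
from sklearn.linear_model import LogisticRegression

def pseudo_label(x_labeled, y_labeled, x_unlabeled, threshold=0.95, rounds=5):
    """Iteratively moves confidently pseudo-labeled examples (NumPy arrays) into the labeled set."""
    model = LogisticRegression()
    for _ in range(rounds):
        model.fit(x_labeled, y_labeled)
        if len(x_unlabeled) == 0:
            break
        probabilities = model.predict_proba(x_unlabeled)
        confident = probabilities.max(axis=1) >= threshold
        if not confident.any():
            break  # No remaining predictions are confident enough.
        pseudo_labels = model.classes_[probabilities[confident].argmax(axis=1)]
        x_labeled = np.vstack([x_labeled, x_unlabeled[confident]])
        y_labeled = np.concatenate([y_labeled, pseudo_labels])
        x_unlabeled = x_unlabeled[~confident]
    return model</pre></devsite-code> <p>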
Semi-supervised learning can be useful if labels are expensive to obtain but unlabeled examples are plentiful.</p> <p><a href="#self-training"><strong>Self-training</strong></a> is one technique for semi-supervised learning.</p> <p><a class="glossary-anchor" name="sensitive_attribute"></a> <h2 class="hide-from-toc" id="sensitive-attribute" data-text=" sensitive attribute" tabindex="-1"> sensitive attribute</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div> A human attribute that may be given special consideration for legal, ethical, social, or personal reasons.</p> <p><a class="glossary-anchor" name="sentiment_analysis"></a> <h2 class="hide-from-toc" id="sentiment-analysis" data-text=" sentiment analysis" tabindex="-1"> sentiment analysis</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>Using statistical or machine learning algorithms to determine a group&#39;s overall attitude—positive or negative—toward a service, product, organization, or topic. For example, using <a href="#natural_language_understanding"><strong>natural language understanding</strong></a>, an algorithm could perform sentiment analysis on the textual feedback from a university course to determine the degree to which students generally liked or disliked the course.</p> <p><a class="glossary-anchor" name="sequence_model"></a> <h2 class="hide-from-toc" id="sequence-model" data-text=" sequence model" tabindex="-1"> sequence model</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Sequence Models">#seq</div> </div></p> <p>A model whose inputs have a sequential dependence. For example, predicting the next video watched from a sequence of previously watched videos.</p> <p><a class="glossary-anchor" name="sequence-to-sequence-task"></a> <h2 class="hide-from-toc" id="sequence-to-sequence-task" data-text=" sequence-to-sequence task" tabindex="-1"> sequence-to-sequence task</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A task that converts an input sequence of <a href="#token"><strong>tokens</strong></a> to an output sequence of tokens. For example, two popular kinds of sequence-to-sequence tasks are:</p> <ul> <li>Translators: <ul> <li>Sample input sequence: &quot;I love you.&quot;</li> <li>Sample output sequence: &quot;Je t&#39;aime.&quot;</li> </ul></li> <li>Question answering: <ul> <li>Sample input sequence: &quot;Do I need my car in New York City?&quot;</li> <li>Sample output sequence: &quot;No. Please keep your car at home.&quot;</li> </ul></li> </ul> <p><a class="glossary-anchor" name="serving"></a> <h2 class="hide-from-toc" id="serving" data-text=" serving" tabindex="-1"> serving</h2></p> <p>The process of making a trained model available to provide predictions through <a href="#online_inference"><strong>online inference</strong></a> or <a href="#offline_inference"><strong>offline inference</strong></a>.</p> <p><a class="glossary-anchor" name="shape"></a> <h2 class="hide-from-toc" id="shape-tensor" data-text=" shape (Tensor)" tabindex="-1"> shape (Tensor)</h2></p> <p> The number of elements in each <b><a href="#dimensions">dimension</a></b> of a tensor. The shape is represented as a list of integers. 
For example, the following two-dimensional tensor has a shape of [3,4]: </p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Text only">[[5, 7, 6, 4], [2, 9, 4, 8], [3, 6, 5, 1]]</pre></devsite-code> <p>TensorFlow uses row-major (C-style) format to represent the order of dimensions, which is why the shape in TensorFlow is <code translate="no" dir="ltr">[3,4]</code> rather than <code translate="no" dir="ltr">[4,3]</code>. In other words, in a two-dimensional TensorFlow Tensor, the shape is <code translate="no" dir="ltr">[</code><em>number of rows</em>, <em>number of columns</em><code translate="no" dir="ltr">]</code>.</p> <p>A <strong>static shape</strong> is a tensor shape that is <em>known</em> at compile time.</p> <p>A <strong>dynamic shape</strong> is <em>unknown</em> at compile time and is therefore dependent on runtime data. This tensor might be represented with a placeholder dimension in TensorFlow, as in <code translate="no" dir="ltr">[3, ?]</code>.</p> <p><a class="glossary-anchor" name="shard"></a> <h2 class="hide-from-toc" id="shard" data-text=" shard " tabindex="-1"> shard </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> <div class="glossary-icon" title="Google Cloud">#GoogleCloud</div> </div></p> <p>A logical division of the <a href="#training_set"><strong>training set</strong></a> or the <a href="#model"><strong>model</strong></a>. Typically, some process creates shards by dividing the <a href="#example"><strong>examples</strong></a> or <a href="#parameter"><strong>parameters</strong></a> into (usually) equal-sized chunks. Each shard is then assigned to a different machine.</p> <p>Sharding a model is called <a href="#model-parallelism"><strong>model parallelism</strong></a>; sharding data is called <a href="#data-parallelism"><strong>data parallelism</strong></a>.</p> <p><a class="glossary-anchor" name="shrinkage"></a> <h2 class="hide-from-toc" id="shrinkage" data-text=" shrinkage " tabindex="-1"> shrinkage </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>A <a href="#hyperparameter"><strong>hyperparameter</strong></a> in <a href="#gradient-boosting"><strong>gradient boosting</strong></a> that controls <a href="#overfitting"><strong>overfitting</strong></a>. Shrinkage in gradient boosting is analogous to <a href="#learning_rate"><strong>learning rate</strong></a> in <a href="#gradient_descent"><strong>gradient descent</strong></a>. Shrinkage is a decimal value between 0.0 and 1.0. A lower shrinkage value reduces overfitting more than a larger shrinkage value.</p> <p><a class="glossary-anchor" name="sigmoid-function"></a> <h2 class="hide-from-toc" id="sigmoid-function" data-text=" sigmoid function" tabindex="-1"> sigmoid function</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A mathematical function that &quot;squishes&quot; an input value into a constrained range, typically 0 to 1 or -1 to +1. That is, you can pass any number (two, a million, negative billion, whatever) to a sigmoid and the output will still be in the constrained range. 
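</p> <p>For example, a minimal Python sketch of the sigmoid function:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded>import math

def sigmoid(x):
    """Squashes any real-valued input into the range (0, 1)."""
    return 1 / (1 + math.exp(-x))

sigmoid(0)     # 0.5
sigmoid(10)    # 0.9999546...
sigmoid(-10)   # 0.0000454...</pre></devsite-code> <p>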
A plot of the sigmoid activation function looks as follows:</p> <p> <img src="/static/machine-learning/glossary/images/sigmoid.svg" loading="lazy" alt="A two-dimensional curved plot with x values spanning the domain -infinity to +infinity, while y values span the range almost 0 to almost 1. When x is 0, y is 0.5. The slope of the curve is always positive, with the highest slope at 0,0.5 and gradually decreasing slopes as the absolute value of x increases." > </p> <p>The sigmoid function has several uses in machine learning, including:</p> <ul> <li>Converting the raw output of a <a href="#logistic_regression"><strong>logistic regression</strong></a> or <a href="#multinomial-regression"><strong>multinomial regression</strong></a> model to a probability.</li> <li>Acting as an <a href="#activation_function"><strong>activation function</strong></a> in some neural networks.</li> </ul> <section class="expandable"> <h4 class="showalways" id="click-the-icon-to-see-the-math._4" data-text=" Click the icon to see the math. " tabindex="-1"> Click the icon to see the math. </h4> <div class="expand-background"> <p> The sigmoid function over an input number <i>x</i> has the following formula: </p> <div> $$ sigmoid(x) = \frac{1}{1 + e^{-\text{x}}} $$ </div> <p> In machine learning, <i>x</i> is generally a <a href="#weighted_sum"><b>weighted sum</b></a>. </p> </div> <hr /> </section> <p><a class="glossary-anchor" name="similarity_measure"></a> <h2 class="hide-from-toc" id="similarity-measure" data-text=" similarity measure" tabindex="-1"> similarity measure</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Clustering">#clustering</div> </div></p> <p>In <a href="#clustering"><strong>clustering</strong></a> algorithms, the metric used to determine how alike (how similar) any two examples are.</p> <p><a class="glossary-anchor" name="single-program"></a> <h2 class="hide-from-toc" id="single-program-multiple-data-spmd" data-text=" single program / multiple data (SPMD)" tabindex="-1"> single program / multiple data (SPMD)</h2></p> <p>A parallelism technique where the same computation is run on different input data in parallel on different devices. The goal of SPMD is to obtain results more quickly. It is the most common style of parallel programming.</p> <p><a class="glossary-anchor" name="size_invariance"></a> <h2 class="hide-from-toc" id="size-invariance" data-text=" size invariance" tabindex="-1"> size invariance</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>In an image classification problem, an algorithm&#39;s ability to successfully classify images even when the size of the image changes. For example, the algorithm can still identify a cat whether it consumes 2M pixels or 200K pixels. Note that even the best image classification algorithms still have practical limits on size invariance.
For example, an algorithm (or human) is unlikely to correctly classify a cat image consuming only 20 pixels.</p> <p>See also <a href="#translational_invariance"><strong>translational invariance</strong></a> and <a href="#rotational_invariance"><strong>rotational invariance</strong></a>.</p> <p><a class="glossary-anchor" name="sketching"></a> <h2 class="hide-from-toc" id="sketching" data-text=" sketching" tabindex="-1"> sketching</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Clustering">#clustering</div> </div></p> <p>In <a href="#unsupervised_machine_learning"><strong>unsupervised machine learning</strong></a>, a category of algorithms that perform a preliminary similarity analysis on examples. Sketching algorithms use a <a href="https://wikipedia.org/wiki/Locality-sensitive_hashing" target="T"> locality-sensitive hash function</a> to identify points that are likely to be similar, and then group them into buckets.</p> <p>Sketching decreases the computation required for similarity calculations on large datasets. Instead of calculating similarity for every single pair of examples in the dataset, we calculate similarity only for each pair of points within each bucket.</p> <p><a class="glossary-anchor" name="skip-gram"></a> <h2 class="hide-from-toc" id="skip-gram" data-text=" skip-gram" tabindex="-1"> skip-gram</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>An <a href="#N-gram"><strong>n-gram</strong></a> which may omit (or &quot;skip&quot;) words from the original context, meaning the N words might not have been originally adjacent. More precisely, a &quot;k-skip-n-gram&quot; is an n-gram for which up to k words may have been skipped.</p> <p>For example, &quot;the quick brown fox&quot; has the following possible 2-grams:</p> <ul> <li>&quot;the quick&quot;</li> <li>&quot;quick brown&quot;</li> <li>&quot;brown fox&quot;</li> </ul> <p>A &quot;1-skip-2-gram&quot; is a pair of words that have at most 1 word between them. Therefore, &quot;the quick brown fox&quot; has the following 1-skip 2-grams:</p> <ul> <li>&quot;the brown&quot;</li> <li>&quot;quick fox&quot;</li> </ul> <p>In addition, all the 2-grams are <em>also</em> 1-skip-2-grams, since fewer than one word may be skipped.</p> <p>Skip-grams are useful for understanding more of a word&#39;s surrounding context. In the example, &quot;fox&quot; was directly associated with &quot;quick&quot; in the set of 1-skip-2-grams, but not in the set of 2-grams.</p> <p>Skip-grams help train <a href="/machine-learning/glossary#word-embedding"><strong>word embedding</strong></a> models.</p> <p><a class="glossary-anchor" name="softmax"></a> <h2 class="hide-from-toc" id="softmax" data-text=" softmax" tabindex="-1"> softmax</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A function that determines probabilities for each possible class in a <a href="#multi-class"><strong>multi-class classification model</strong></a>. The probabilities add up to exactly 1.0. 
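</p> <p>As a quick Python sketch (not any library's official implementation), the following function turns an arbitrary vector of raw scores into probabilities that sum to 1.0; the input [1.2, 2.5, 1.8] matches the worked example later in this entry:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded>
import math

def softmax(scores):
  """Converts raw scores into probabilities that sum to 1.0."""
  exps = [math.exp(score) for score in scores]
  total = sum(exps)
  return [value / total for value in exps]

probabilities = softmax([1.2, 2.5, 1.8])
print(probabilities)       # Approximately [0.154, 0.565, 0.281]
print(sum(probabilities))  # 1.0 (up to floating-point rounding)
</pre></devsite-code> <p>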
For example, the following table shows how softmax distributes various probabilities:</p> <table> <tr><th>Image is a...</th> <th>Probability</th></tr> <tr><td>dog</td> <td>.85</td></tr> <tr><td>cat</td> <td>.13</td></tr> <tr><td>horse</td> <td>.02</td></tr> </table> <p>Softmax is also called <strong>full softmax</strong>.</p> <p>Contrast with <a href="#candidate_sampling"><strong>candidate sampling</strong></a>.</p> <section class="expandable"> <h4 class="showalways" id="click-the-icon-to-see-the-math._5" data-text=" Click the icon to see the math. " tabindex="-1"> Click the icon to see the math. </h4> <div class="expand-background"> <p>The softmax equation is as follows:</p> <div> $$\sigma_i = \frac{e^{z_i}}{\sum_{j=1}^{K} e^{z_j}}$$ </div> where: <ul> <li>$\sigma_i$ is element $i$ of the output vector and specifies the probability of class $i$. The sum of all the elements in the output vector is 1.0. The output vector contains the same number of elements as the input vector, $z$.</li> <li>$z$ is the input vector. Each element of the input vector contains a floating-point value.</li> <li>$K$ is the number of elements in the input vector (and the output vector).</li> </ul> <p>For example, suppose the input vector is:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded> [1.2, 2.5, 1.8] </pre></devsite-code> <p>Therefore, softmax calculates the denominator as follows:</p> <div> $$\text{denominator} = e^{1.2} + e^{2.5} + e^{1.8} = 21.552$$ </div> <p>The softmax probability of each element is therefore:</p> <div> $$\sigma_1 = \frac{e^{1.2}}{21.552} = 0.154 $$ $$\sigma_2 = \frac{e^{2.5}}{21.552} = 0.565 $$ $$\sigma_3 = \frac{e^{1.8}}{21.552} = 0.281 $$ </div> <p>The output vector is therefore:</p> <div> $$\sigma = [0.154, 0.565, 0.281]$$ </div> <p>The sum of the three elements in $\sigma$ is 1.0. Phew!</p> </div> <hr /> </section> <p><a class="glossary-anchor" name="soft_prompt_tuning"></a> <h2 class="hide-from-toc" id="soft-prompt-tuning" data-text=" soft prompt tuning" tabindex="-1"> soft prompt tuning</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>A technique for tuning a <a href="#large-language-model"><strong>large language model</strong></a> for a particular task, without resource-intensive <a href="#fine-tuning"><strong>fine-tuning</strong></a>. Instead of retraining all the <a href="#weight"><strong>weights</strong></a> in the model, soft prompt tuning automatically adjusts a <a href="#prompt"><strong>prompt</strong></a> to achieve the same goal.</p> <p>Given a textual prompt, soft prompt tuning typically appends additional token embeddings to the prompt and uses backpropagation to optimize the input.</p> <p>A &quot;hard&quot; prompt contains actual tokens instead of token embeddings.</p> <p><a class="glossary-anchor" name="sparse_features"></a> <h2 class="hide-from-toc" id="sparse-feature" data-text=" sparse feature" tabindex="-1"> sparse feature</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A <a href="#feature"><strong>feature</strong></a> whose values are predominantly zero or empty. For example, a feature containing a single 1 value and a million 0 values is sparse.
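</p> <p>The following Python sketch (a hypothetical feature vector, not from any real dataset) makes that definition concrete by measuring how many values are nonzero:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded>
# Hypothetical sparse feature: a single 1 among a million 0s.
sparse_feature = [0] * 1_000_000 + [1]

nonzero_count = sum(1 for value in sparse_feature if value != 0)
print(nonzero_count)                        # 1
print(nonzero_count / len(sparse_feature))  # ~0.000001, so the feature is sparse
</pre></devsite-code> <p>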
In contrast, a <a href="#dense_feature"><strong>dense feature</strong></a> has values that are predominantly not zero or empty.</p> <p>In machine learning, a surprising number of features are sparse features. Categorical features are usually sparse features. For example, of the 300 possible tree species in a forest, a single example might identify just a <em>maple tree</em>. Or, of the millions of possible videos in a video library, a single example might identify just &quot;Casablanca.&quot;</p> <p>In a model, you typically represent sparse features with <a href="#one-hot_encoding"><strong>one-hot encoding</strong></a>. If the one-hot encoding is big, you might put an <a href="#embedding_layer"><strong>embedding layer</strong></a> on top of the one-hot encoding for greater efficiency.</p> <p><a class="glossary-anchor" name="sparse_representation"></a> <h2 class="hide-from-toc" id="sparse-representation" data-text=" sparse representation" tabindex="-1"> sparse representation</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Storing only the <em>position(s)</em> of nonzero elements in a sparse feature.</p> <p>For example, suppose a categorical feature named <code translate="no" dir="ltr">species</code> identifies the 36 tree species in a particular forest. Further assume that each <a href="#example"><strong>example</strong></a> identifies only a single species.</p> <p>You could use a one-hot vector to represent the tree species in each example. A one-hot vector would contain a single <code translate="no" dir="ltr">1</code> (to represent the particular tree species in that example) and 35 <code translate="no" dir="ltr">0</code>s (to represent the 35 tree species <em>not</em> in that example). So, the one-hot representation of <code translate="no" dir="ltr">maple</code> might look something like the following:</p> <p> <img src="/static/machine-learning/glossary/images/One-HotRepresentationOfASparseFeature.png" loading="lazy" alt="A vector in which positions 0 through 23 hold the value 0, position 24 holds the value 1, and positions 25 through 35 hold the value 0."> </p> <p>Alternatively, sparse representation would simply identify the position of the particular species. If <code translate="no" dir="ltr">maple</code> is at position 24, then the sparse representation of <code translate="no" dir="ltr">maple</code> would simply be:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded> 24 </pre></devsite-code> <p>Notice that the sparse representation is much more compact than the one-hot representation.</p> <aside class="note"><strong>Note:</strong><span> You shouldn&#39;t pass a sparse representation as a direct feature input to a model. Instead, you should convert the sparse representation into a one-hot representation before training on it.</span></aside> <section class="expandable"> <h4 class="showalways" id="click-the-icon-for-a-slightly-more-complex-example." data-text=" Click the icon for a slightly more complex example. " tabindex="-1"> Click the icon for a slightly more complex example. </h4> <div class="expand-background"> <p> Suppose each example in your model must represent the words—but not the order of those words—in an English sentence. English consists of about 170,000 words, so English is a categorical feature with about 170,000 elements. 
Most English sentences use an extremely tiny fraction of those 170,000 words, so the set of words in a single example is almost certainly going to be sparse data. </p> <p>Consider the following sentence:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded> My dog is a great dog </pre></devsite-code> <p> You could use a variant of one-hot vector to represent the words in this sentence. In this variant, multiple cells in the vector can contain a nonzero value. Furthermore, in this variant, a cell can contain an integer other than one. Although the words "my", "is", "a", and "great" appear only once in the sentence, the word "dog" appears twice. Using this variant of one-hot vectors to represent the words in this sentence yields the following 170,000-element vector: </p> <p> <img src="/static/machine-learning/glossary/images/One-HotRepresentationOfWordsInASentence.png" loading="lazy" alt="A vector of 170,000 integers. The number 1 is at vector position 0, 45770, 58906, and 91520. The number 2 is at position 26,100. Zeroes are at the remaining 169,996 positions."> </p> <p>A sparse representation of the same sentence would simply be:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="ActionScript 3"><span class="devsite-syntax-mi">0</span><span class="devsite-syntax-o">:</span><span class="devsite-syntax-w"> </span><span class="devsite-syntax-mi">1</span> <span class="devsite-syntax-mi">26100</span><span class="devsite-syntax-o">:</span><span class="devsite-syntax-w"> </span><span class="devsite-syntax-mi">2</span> <span class="devsite-syntax-mi">45770</span><span class="devsite-syntax-o">:</span><span class="devsite-syntax-w"> </span><span class="devsite-syntax-mi">1</span> <span class="devsite-syntax-mi">58906</span><span class="devsite-syntax-o">:</span><span class="devsite-syntax-w"> </span><span class="devsite-syntax-mi">1</span> <span class="devsite-syntax-mi">91520</span><span class="devsite-syntax-o">:</span><span class="devsite-syntax-w"> </span><span class="devsite-syntax-mi">1</span></pre></devsite-code> </div> <hr /> </section> <section class="expandable"> <h4 class="showalways" id="click-the-icon-if-you-are-confused." data-text=" Click the icon if you are confused. " tabindex="-1"> Click the icon if you are confused. </h4> <div class="expand-background"> <p>The term "sparse representation" confuses a lot of people because sparse representation is itself <i>not a sparse vector</i>. Rather, sparse representation is actually a <i>dense representation of a sparse vector</i>. The synonym <b>index representation</b> is a little clearer than "sparse representation." </p> </div> <hr /> </section> <p><a class="glossary-anchor" name="sparse_vector"></a> <h2 class="hide-from-toc" id="sparse-vector" data-text=" sparse vector" tabindex="-1"> sparse vector</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A vector whose values are mostly zeroes. See also <a href="#sparse_features"><strong>sparse feature</strong></a> and <a href="#sparsity"><strong>sparsity</strong></a>.</p> <p><a class="glossary-anchor" name="sparsity"></a> <h2 class="hide-from-toc" id="sparsity" data-text=" sparsity" tabindex="-1"> sparsity</h2></p> <p>The number of elements set to zero (or null) in a vector or matrix divided by the total number of entries in that vector or matrix. For example, consider a 100-element matrix in which 98 cells contain zero. 
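</p> <p>A minimal Python sketch of that calculation, using a flattened list as a stand-in for the matrix (the two nonzero values are arbitrary):</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded>
# A 100-element matrix, flattened: 98 zeros plus two arbitrary nonzero values.
matrix = [0] * 98 + [3.1, 7.4]

zero_count = sum(1 for value in matrix if value == 0)
sparsity = zero_count / len(matrix)
print(sparsity)  # 0.98
</pre></devsite-code> <p>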
The calculation of sparsity is as follows:</p> <div> $$ {\text{sparsity}} = \frac{\text{98}} {\text{100}} = {\text{0.98}} $$ </div> <p><strong>Feature sparsity</strong> refers to the sparsity of a feature vector; <strong>model sparsity</strong> refers to the sparsity of the model weights.</p> <p><a class="glossary-anchor" name="spatial_pooling"></a> <h2 class="hide-from-toc" id="spatial-pooling" data-text=" spatial pooling" tabindex="-1"> spatial pooling</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>See <a href="#pooling"><strong>pooling</strong></a>.</p> <p><a class="glossary-anchor" name="split"></a> <h2 class="hide-from-toc" id="split" data-text=" split " tabindex="-1"> split </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>In a <a href="#decision-tree"><strong>decision tree</strong></a>, another name for a <a href="#condition"><strong>condition</strong></a>.</p> <p><a class="glossary-anchor" name="splitter"></a> <h2 class="hide-from-toc" id="splitter" data-text=" splitter " tabindex="-1"> splitter </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>While training a <a href="#decision-tree"><strong>decision tree</strong></a>, the routine (and algorithm) responsible for finding the best <a href="#condition"><strong>condition</strong></a> at each <a href="#node-decision-tree"><strong>node</strong></a>.</p> <p><a class="glossary-anchor" name="SPMD"></a> <h2 class="hide-from-toc" id="spmd" data-text=" SPMD" tabindex="-1"> SPMD</h2></p> <p>Abbreviation for <a href="#single-program"><strong>single program / multiple data</strong></a>.</p> <p><a class="glossary-anchor" name="squared_hinge_loss"></a> <h2 class="hide-from-toc" id="squared-hinge-loss" data-text=" squared hinge loss" tabindex="-1"> squared hinge loss</h2></p> <p>The square of the <a href="#hinge-loss"><strong>hinge loss</strong></a>. Squared hinge loss penalizes outliers more harshly than regular hinge loss.</p> <p><a class="glossary-anchor" name="squared_loss"></a> <h2 class="hide-from-toc" id="squared-loss" data-text=" squared loss" tabindex="-1"> squared loss</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Synonym for <a href="#L2_loss"><b>L<sub>2</sub> loss</b></a>.</p> <p><a class="glossary-anchor" name="staged-training"></a> <h2 class="hide-from-toc" id="staged-training" data-text=" staged training" tabindex="-1"> staged training</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A tactic of training a model in a sequence of discrete stages. The goal can be either to speed up the training process, or to achieve better model quality.</p> <p>An illustration of the progressive stacking approach is shown below:</p> <ul> <li>Stage 1 contains 3 hidden layers, stage 2 contains 6 hidden layers, and stage 3 contains 12 hidden layers.</li> <li>Stage 2 begins training with the weights learned in the 3 hidden layers of Stage 1. Stage 3 begins training with the weights learned in the 6 hidden layers of Stage 2.</li> </ul> <p> <img src="/static/machine-learning/glossary/images/staged-training.png" loading="lazy" alt="Three stages, which are labeled Stage 1, Stage 2, and Stage 3. 
Each stage contains a different number of layers: Stage 1 contains 3 layers, Stage 2 contains 6 layers, and Stage 3 contains 12 layers. The 3 layers from Stage 1 become the first 3 layers of Stage 2. Similarly, the 6 layers from Stage 2 become the first 6 layers of Stage 3."> </p> <p>See also <a href="#pipelining"><strong>pipelining</strong></a>.</p> <p><a class="glossary-anchor" name="state"></a> <h2 class="hide-from-toc" id="state" data-text=" state" tabindex="-1"> state</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>In reinforcement learning, the parameter values that describe the current configuration of the environment, which the <a href="#agent"><strong>agent</strong></a> uses to choose an <a href="#action"><strong>action</strong></a>.</p> <p><a class="glossary-anchor" name="state-action_value_function"></a> <h2 class="hide-from-toc" id="state-action-value-function" data-text=" state-action value function" tabindex="-1"> state-action value function</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>Synonym for <a href="#q-function"><strong>Q-function</strong></a>.</p> <p><a class="glossary-anchor" name="static"></a> <a class="glossary-anchor" name="static-model"></a> <h2 class="hide-from-toc" id="static" data-text=" static" tabindex="-1"> static</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Something done once rather than continuously. The terms <strong>static</strong> and <strong>offline</strong> are synonyms. The following are common uses of <strong>static</strong> and <strong>offline</strong> in machine learning:</p> <ul> <li><strong>static model</strong> (or <strong>offline model</strong>) is a model trained once and then used for a while.</li> <li><strong>static training</strong> (or <strong>offline training</strong>) is the process of training a static model.</li> <li><strong>static inference</strong> (or <strong>offline inference</strong>) is a process in which a model generates a batch of predictions at a time.</li> </ul> <p>Contrast with <a href="#dynamic"><strong>dynamic</strong></a>.</p> <p><a class="glossary-anchor" name="static-inference"></a> <h2 class="hide-from-toc" id="static-inference" data-text=" static inference" tabindex="-1"> static inference</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Synonym for <a href="#offline_inference"><strong>offline inference</strong></a>.</p> <p><a class="glossary-anchor" name="stationarity"></a> <h2 class="hide-from-toc" id="stationarity" data-text=" stationarity" tabindex="-1"> stationarity</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A feature whose values don&#39;t change across one or more dimensions, usually time. For example, a feature whose values look about the same in 2021 and 2023 exhibits stationarity.</p> <p>In the real world, very few features exhibit stationarity. 
Even features synonymous with stability (like sea level) change over time.</p> <p>Contrast with <a href="#nonstationarity"><strong>nonstationarity</strong></a>.</p> <p><a class="glossary-anchor" name="step"></a> <h2 class="hide-from-toc" id="step" data-text=" step" tabindex="-1"> step</h2></p> <p>A forward pass and backward pass of one <a href="#batch"><strong>batch</strong></a>.</p> <p>See <a href="#backpropagation"><strong>backpropagation</strong></a> for more information on the forward pass and backward pass.</p> <p><a class="glossary-anchor" name="step_size"></a> <h2 class="hide-from-toc" id="step-size" data-text=" step size" tabindex="-1"> step size</h2></p> <p>Synonym for <a href="#learning_rate"><strong>learning rate</strong></a>.</p> <p><a class="glossary-anchor" name="SGD"></a> <h2 class="hide-from-toc" id="stochastic-gradient-descent-sgd" data-text=" stochastic gradient descent (SGD)" tabindex="-1"> stochastic gradient descent (SGD)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A <a href="#gradient_descent"><strong>gradient descent</strong></a> algorithm in which the <a href="#batch_size"><strong>batch size</strong></a> is one. In other words, SGD trains on a single example chosen uniformly at random from a <a href="#training_set"><strong>training set</strong></a>.</p> <p><a class="glossary-anchor" name="stride"></a> <h2 class="hide-from-toc" id="stride" data-text=" stride" tabindex="-1"> stride</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>In a convolutional operation or pooling, the delta in each dimension of the next series of input slices. For example, the following animation demonstrates a (1,1) stride during a convolutional operation. Therefore, the next input slice starts one position to the right of the previous input slice. When the operation reaches the right edge, the next slice is all the way over to the left but one position down.</p> <p> <img src="/static/machine-learning/glossary/images/AnimatedConvolution.gif" loading="lazy" alt="An input 5x5 matrix and a 3x3 convolutional filter. Because the stride is (1,1), a convolutional filter will be applied 9 times. The first convolutional slice evaluates the top-left 3x3 submatrix of the input matrix. The second slice evaluates the top-middle 3x3 submatrix. The third convolutional slice evaluates the top-right 3x3 submatrix. The fourth slice evaluates the middle-left 3x3 submatrix. The fifth slice evaluates the middle 3x3 submatrix. The sixth slice evaluates the middle-right 3x3 submatrix. The seventh slice evaluates the bottom-left 3x3 submatrix. The eighth slice evaluates the bottom-middle 3x3 submatrix. The ninth slice evaluates the bottom-right 3x3 submatrix." > </p> <p>The preceding example demonstrates a two-dimensional stride. 
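</p> <p>The following Python sketch (an arbitrary 5x5 input and a 3x3 filter of ones, mirroring the animation above) enumerates the nine slice positions that a (1,1) stride produces:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded>
# Arbitrary 5x5 input matrix and a 3x3 filter containing all ones.
input_matrix = [[row * 5 + col for col in range(5)] for row in range(5)]
conv_filter = [[1, 1, 1], [1, 1, 1], [1, 1, 1]]
stride = (1, 1)

output = []
for top in range(0, 5 - 3 + 1, stride[0]):      # Slide down one row at a time.
  output_row = []
  for left in range(0, 5 - 3 + 1, stride[1]):   # Slide right one column at a time.
    # Element-wise multiply the 3x3 slice by the filter, then sum the products.
    total = sum(input_matrix[top + i][left + j] * conv_filter[i][j]
                for i in range(3) for j in range(3))
    output_row.append(total)
  output.append(output_row)

print(output)  # A 3x3 result: the filter was applied 9 times.
</pre></devsite-code> <p>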
If the input matrix is three-dimensional, the stride would also be three-dimensional.</p> <p><a class="glossary-anchor" name="SRM"></a> <h2 class="hide-from-toc" id="structural-risk-minimization-srm" data-text=" structural risk minimization (SRM)" tabindex="-1"> structural risk minimization (SRM)</h2></p> <p>An algorithm that balances two goals:</p> <ul> <li>The need to build the most predictive model (for example, lowest loss).</li> <li>The need to keep the model as simple as possible (for example, strong regularization).</li> </ul> <p>For example, a function that minimizes loss+regularization on the training set is a structural risk minimization algorithm.</p> <p>Contrast with <a href="#ERM"><strong>empirical risk minimization</strong></a>.</p> <p><a class="glossary-anchor" name="subsampling"></a> <h2 class="hide-from-toc" id="subsampling" data-text=" subsampling" tabindex="-1"> subsampling</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>See <a href="#pooling"><strong>pooling</strong></a>.</p> <p><a class="glossary-anchor" name="subword_token"></a> <h2 class="hide-from-toc" id="subword-token" data-text=" subword token" tabindex="-1"> subword token</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>In <a href="#language-model"><strong>language models</strong></a>, a <a href="#token"><strong>token</strong></a> that is a substring of a word, which may be the entire word.</p> <p>For example, a word like &quot;itemize&quot; might be broken up into the pieces &quot;item&quot; (a root word) and &quot;ize&quot; (a suffix), each of which is represented by its own token. Splitting uncommon words into such pieces, called subwords, allows language models to operate on the word&#39;s more common constituent parts, such as prefixes and suffixes.</p> <p>Conversely, common words like &quot;going&quot; might not be broken up and might be represented by a single token.</p> <p><a class="glossary-anchor" name="summary"></a> <h2 class="hide-from-toc" id="summary" data-text=" summary" tabindex="-1"> summary</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>In TensorFlow, a value or set of values calculated at a particular <a href="#step"><strong>step</strong></a>, usually used for tracking model metrics during training.</p> <p><a class="glossary-anchor" name="supervised_machine_learning"></a> <h2 class="hide-from-toc" id="supervised-machine-learning" data-text=" supervised machine learning" tabindex="-1"> supervised machine learning</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Training a <a href="#model"><strong>model</strong></a> from <a href="#feature"><strong>features</strong></a> and their corresponding <a href="#label"><strong>labels</strong></a>. Supervised machine learning is analogous to learning a subject by studying a set of questions and their corresponding answers. 
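</p> <p>In code, supervised training consumes features paired with labels. Here is a minimal Python sketch (hypothetical data, a single weight and bias trained with plain gradient descent on squared loss):</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded>
# Hypothetical labeled examples; the labels roughly follow label = 2 * feature + 1.
features = [1.0, 2.0, 3.0, 4.0]
labels = [3.1, 4.9, 7.2, 8.8]

weight, bias = 0.0, 0.0
learning_rate = 0.01

for _ in range(2000):
  for x, y in zip(features, labels):
    error = (weight * x + bias) - y
    # Gradient of the squared loss with respect to the weight and the bias.
    weight -= learning_rate * 2 * error * x
    bias -= learning_rate * 2 * error

print(weight, bias)  # Roughly 1.94 and 1.15, learned from the labeled examples.
</pre></devsite-code> <p>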
After mastering the mapping between questions and answers, a student can then provide answers to new (never-before-seen) questions on the same topic.</p> <p>Compare with <a href="#unsupervised_machine_learning"><strong>unsupervised machine learning</strong></a>.</p> <p><a class="glossary-anchor" name="synthetic_feature"></a> <h2 class="hide-from-toc" id="synthetic-feature" data-text=" synthetic feature" tabindex="-1"> synthetic feature</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A <a href="#feature"><strong>feature</strong></a> not present among the input features, but assembled from one or more of them. Methods for creating synthetic features include the following:</p> <ul> <li><a href="#bucketing"><strong>Bucketing</strong></a> a continuous feature into range bins.</li> <li>Creating a <a href="#feature_cross"><strong>feature cross</strong></a>.</li> <li>Multiplying (or dividing) one feature value by other feature value(s) or by itself. For example, if <code translate="no" dir="ltr">a</code> and <code translate="no" dir="ltr">b</code> are input features, then the following are examples of synthetic features: <ul> <li><tt>ab</tt></li> <li><tt>a<sup>2</sup></tt></li> </ul></li> <li>Applying a transcendental function to a feature value. For example, if <code translate="no" dir="ltr">c</code> is an input feature, then the following are examples of synthetic features: <ul> <li><tt>sin(c)</tt></li> <li><tt>ln(c)</tt></li> </ul></li> </ul> <p>Features created by <a href="#normalization"><strong>normalizing</strong></a> or <a href="#scaling"><strong>scaling</strong></a> alone are not considered synthetic features.</p> <p><a class="glossary-anchor" name="t"></a> <h2 class="glossary" id="t" data-text="T" tabindex="-1">T</h2></p> <p><a class="glossary-anchor" name="T5"></a> <h2 class="hide-from-toc" id="t5" data-text=" T5" tabindex="-1"> T5</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A text-to-text <a href="#transfer_learning"><strong>transfer learning</strong></a> <a href="#model"><strong>model</strong></a> introduced by <a href="https://arxiv.org/pdf/1910.10683.pdf" target="T"> Google AI in 2020</a>. T5 is an <a href="#encoder"><strong>encoder</strong></a>-<a href="#decoder"><strong>decoder</strong></a> model, based on the <a href="#transformer"><strong>Transformer</strong></a> architecture, trained on an extremely large dataset. It is effective at a variety of natural language processing tasks, such as generating text, translating languages, and answering questions in a conversational manner.</p> <p>T5 gets its name from the five T&#39;s in &quot;Text-to-Text Transfer Transformer.&quot;</p> <p><a class="glossary-anchor" name="T5X"></a> <h2 class="hide-from-toc" id="t5x" data-text=" T5X" tabindex="-1"> T5X</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>An open-source, <a href="#machine_learning"><strong>machine learning</strong></a> framework designed to build and <a href="#training"><strong>train</strong></a> large-scale natural language processing (NLP) models. 
<a href="#T5"><strong>T5</strong></a> is implemented on the T5X codebase (which is built on <a href="#JAX"><strong>JAX</strong></a> and <a href="#flax"><strong>Flax</strong></a>).</p> <p><a class="glossary-anchor" name="tabular_q-learning"></a> <h2 class="hide-from-toc" id="tabular-q-learning" data-text=" tabular Q-learning" tabindex="-1"> tabular Q-learning</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>In <a href="#reinforcement_learning"><strong>reinforcement learning</strong></a>, implementing <a href="#q-learning"><strong>Q-learning</strong></a> by using a table to store the <a href="#q-function"><strong>Q-functions</strong></a> for every combination of <a href="#state"><strong>state</strong></a> and <a href="#action"><strong>action</strong></a>.</p> <p><a class="glossary-anchor" name="target"></a> <h2 class="hide-from-toc" id="target" data-text=" target" tabindex="-1"> target</h2></p> <p>Synonym for <a href="#label"><strong>label</strong></a>.</p> <p><a class="glossary-anchor" name="target_network"></a> <h2 class="hide-from-toc" id="target-network" data-text=" target network" tabindex="-1"> target network</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>In <a href="#q-learning"><strong>Deep Q-learning</strong></a>, a neural network that is a stable approximation of the main neural network, where the main neural network implements either a <a href="#q-function"><strong>Q-function</strong></a> or a <a href="#policy"><strong>policy</strong></a>. Then, you can train the main network on the Q-values predicted by the target network. Therefore, you prevent the feedback loop that occurs when the main network trains on Q-values predicted by itself. By avoiding this feedback, training stability increases.</p> <p><a class="glossary-anchor" name="task"></a> <h2 class="hide-from-toc" id="task" data-text=" task" tabindex="-1"> task</h2></p> <p>A problem that can be solved using machine learning techniques, such as:</p> <ul> <li><a href="#classification_model"><strong>classification</strong></a></li> <li><a href="#regression_model"><strong>regression</strong></a></li> <li><a href="#clustering"><strong>clustering</strong></a></li> <li><a href="#anomaly-detection"><strong>anomaly detection</strong></a></li> </ul> <p><a class="glossary-anchor" name="temperature"></a> <h2 class="hide-from-toc" id="temperature" data-text=" temperature" tabindex="-1"> temperature</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Image Models">#image</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>A <a href="#hyperparameter"><strong>hyperparameter</strong></a> that controls the degree of randomness of a model&#39;s output. Higher temperatures result in more random output, while lower temperatures result in less random output.</p> <p>Choosing the best temperature depends on the specific application and the preferred properties of the model&#39;s output. For example, you would probably raise the temperature when creating an application that generates creative output. 
Conversely, you would probably lower the temperature when building a model that classifies images or text in order to improve the model&#39;s accuracy and consistency.</p> <p>Temperature is often used with <a href="#softmax"><strong>softmax</strong></a>.</p> <p><a class="glossary-anchor" name="temporal_data"></a> <h2 class="hide-from-toc" id="temporal-data" data-text=" temporal data" tabindex="-1"> temporal data</h2></p> <p>Data recorded at different points in time. For example, winter coat sales recorded for each day of the year would be temporal data.</p> <p><a class="glossary-anchor" name="tensor"></a> <h2 class="hide-from-toc" id="tensor" data-text=" Tensor" tabindex="-1"> Tensor</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>The primary data structure in TensorFlow programs. Tensors are N-dimensional (where N could be very large) data structures, most commonly scalars, vectors, or matrixes. The elements of a Tensor can hold integer, floating-point, or string values.</p> <p><a class="glossary-anchor" name="TensorBoard"></a> <h2 class="hide-from-toc" id="tensorboard" data-text=" TensorBoard" tabindex="-1"> TensorBoard</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>The dashboard that displays the summaries saved during the execution of one or more TensorFlow programs.</p> <p><a class="glossary-anchor" name="TensorFlow"></a> <h2 class="hide-from-toc" id="tensorflow" data-text=" TensorFlow" tabindex="-1"> TensorFlow</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>A large-scale, distributed, machine learning platform. The term also refers to the base API layer in the TensorFlow stack, which supports general computation on dataflow graphs.</p> <p>Although TensorFlow is primarily used for machine learning, you may also use TensorFlow for non-ML tasks that require numerical computation using dataflow graphs.</p> <p><a class="glossary-anchor" name="TensorFlow_Playground"></a> <h2 class="hide-from-toc" id="tensorflow-playground" data-text=" TensorFlow Playground" tabindex="-1"> TensorFlow Playground</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>A program that visualizes how different <a href="#hyperparameter"><strong>hyperparameters</strong></a> influence model (primarily neural network) training. Go to <a href="http://playground.tensorflow.org" target="T"> http://playground.tensorflow.org</a> to experiment with TensorFlow Playground.</p> <p><a class="glossary-anchor" name="TensorFlow_Serving"></a> <h2 class="hide-from-toc" id="tensorflow-serving" data-text=" TensorFlow Serving" tabindex="-1"> TensorFlow Serving</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>A platform to deploy trained models in production.</p> <p><a class="glossary-anchor" name="TPU"></a> <h2 class="hide-from-toc" id="tensor-processing-unit-tpu" data-text=" Tensor Processing Unit (TPU)" tabindex="-1"> Tensor Processing Unit (TPU)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> <div class="glossary-icon" title="Google Cloud">#GoogleCloud</div> </div></p> <p>An application-specific integrated circuit (ASIC) that optimizes the performance of machine learning workloads. 
These ASICs are deployed as multiple <a href="#TPU_chip"><strong>TPU chips</strong></a> on a <a href="#TPU_device"><strong>TPU device</strong></a>.</p> <p><a class="glossary-anchor" name="tensor_rank"></a> <h2 class="hide-from-toc" id="tensor-rank" data-text=" Tensor rank" tabindex="-1"> Tensor rank</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>See <a href="#rank_Tensor"><strong>rank (Tensor)</strong></a>.</p> <p><a class="glossary-anchor" name="tensor_shape"></a> <h2 class="hide-from-toc" id="tensor-shape" data-text=" Tensor shape" tabindex="-1"> Tensor shape</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>The number of elements a <a href="#tensor"><strong>Tensor</strong></a> contains in various dimensions. For example, a <code translate="no" dir="ltr">[5, 10]</code> Tensor has a shape of 5 in one dimension and 10 in another.</p> <p><a class="glossary-anchor" name="tensor_size"></a> <h2 class="hide-from-toc" id="tensor-size" data-text=" Tensor size" tabindex="-1"> Tensor size</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>The total number of scalars a <a href="#tensor"><strong>Tensor</strong></a> contains. For example, a <code translate="no" dir="ltr">[5, 10]</code> Tensor has a size of 50.</p> <p><a class="glossary-anchor" name="TensorStore"></a> <h2 class="hide-from-toc" id="tensorstore" data-text=" TensorStore " tabindex="-1"> TensorStore </h2></p> <p>A <a href="https://google.github.io/tensorstore/">library</a> for efficiently reading and writing large multi-dimensional arrays.</p> <p><a class="glossary-anchor" name="termination_condition"></a> <h2 class="hide-from-toc" id="termination-condition" data-text=" termination condition" tabindex="-1"> termination condition</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>In <a href="#reinforcement_learning"><strong>reinforcement learning</strong></a>, the conditions that determine when an <a href="#episode"><strong>episode</strong></a> ends, such as when the agent reaches a certain state or exceeds a threshold number of state transitions. For example, in <a href="https://wikipedia.org/wiki/Tic-tac-toe">tic-tac-toe</a> (also known as noughts and crosses), an episode terminates either when a player marks three consecutive spaces or when all spaces are marked.</p> <p><a class="glossary-anchor" name="test"></a> <h2 class="hide-from-toc" id="test" data-text=" test " tabindex="-1"> test </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>In a <a href="#decision-tree"><strong>decision tree</strong></a>, another name for a <a href="#condition"><strong>condition</strong></a>.</p> <p><a class="glossary-anchor" name="test-loss"></a> <h2 class="hide-from-toc" id="test-loss" data-text=" test loss" tabindex="-1"> test loss</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A <a href="#metric"><strong>metric</strong></a> representing a model&#39;s <a href="#loss"><strong>loss</strong></a> against the <a href="#test_set"><strong>test set</strong></a>. When building a <a href="#model"><strong>model</strong></a>, you typically try to minimize test loss. 
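</p> <p>As a small Python sketch (a hypothetical already-trained model and hypothetical held-out examples), test loss is simply the loss function evaluated on the test set:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded>
def mean_squared_error(predictions, labels):
  """Averages the squared differences between predictions and labels."""
  return sum((p - y) ** 2 for p, y in zip(predictions, labels)) / len(labels)

# Hypothetical trained model.
def predict(x):
  return 2 * x + 1

# Hypothetical test set: examples never seen during training.
test_features = [5.0, 6.0, 7.0]
test_labels = [11.3, 12.8, 15.4]

test_loss = mean_squared_error([predict(x) for x in test_features], test_labels)
print(test_loss)  # The value you typically try to minimize when building a model.
</pre></devsite-code> <p>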
That&#39;s because a low test loss is a stronger quality signal than a low <a href="#training-loss"><strong>training loss</strong></a> or low <a href="#validation-loss"><strong>validation loss</strong></a>.</p> <p>A large gap between test loss and training loss or validation loss sometimes suggests that you need to increase the <a href="#regularization_rate"><strong>regularization rate</strong></a>.</p> <p><a class="glossary-anchor" name="test_set"></a> <h2 class="hide-from-toc" id="test-set" data-text=" test set" tabindex="-1"> test set</h2></p> <p>A subset of the <a href="#dataset"><strong>dataset</strong></a> reserved for testing a trained <a href="#model"><strong>model</strong></a>.</p> <p>Traditionally, you divide examples in the dataset into the following three distinct subsets:</p> <ul> <li>a <a href="#training_set"><strong>training set</strong></a></li> <li>a <a href="#validation_set"><strong>validation set</strong></a></li> <li>a test set</li> </ul> <p>Each example in a dataset should belong to only one of the preceding subsets. For instance, a single example shouldn&#39;t belong to both the training set and the test set.</p> <p>The training set and validation set are both closely tied to training a model. Because the test set is only indirectly associated with training, <a href="#test-loss"><strong>test loss</strong></a> is a less biased, higher quality metric than <a href="#training-loss"><strong>training loss</strong></a> or <a href="#validation-loss"><strong>validation loss</strong></a>.</p> <p><a class="glossary-anchor" name="text-span"></a> <h2 class="hide-from-toc" id="text-span" data-text=" text span" tabindex="-1"> text span</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>The array index span associated with a specific subsection of a text string. For example, the word <code translate="no" dir="ltr">good</code> in the Python string <code translate="no" dir="ltr">s=&quot;Be good now&quot;</code> occupies the text span from 3 to 6.</p> <p><a class="glossary-anchor" name="tf.Example"></a> <h2 class="hide-from-toc" id="tf.example" data-text=" tf.Example" tabindex="-1"> tf.Example</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>A standard <a href="https://developers.google.com/protocol-buffers/" target="T"> protocol buffer</a> for describing input data for machine learning model training or inference.</p> <p><a class="glossary-anchor" name="tf.keras"></a> <h2 class="hide-from-toc" id="tf.keras" data-text=" tf.keras" tabindex="-1"> tf.keras</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> </div></p> <p>An implementation of <a href="#Keras"><strong>Keras</strong></a> integrated into <a href="#TensorFlow"><strong>TensorFlow</strong></a>.</p> <p><a class="glossary-anchor" name="threshold"></a> <h2 class="hide-from-toc" id="threshold-for-decision-trees" data-text=" threshold (for decision trees) " tabindex="-1"> threshold (for decision trees) </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>In an <a href="#axis-aligned-condition"><strong>axis-aligned condition</strong></a>, the value that a <a href="#feature"><strong>feature</strong></a> is being compared against. 
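</p> <p>In Python terms, an axis-aligned condition is just a comparison of a single feature against a threshold; a minimal sketch (hypothetical feature name and threshold value) that routes an example at a node:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded>
THRESHOLD = 75  # Hypothetical threshold for the feature "grade".

def condition_is_true(example):
  """Decides which branch of the node an example follows."""
  return example["grade"] >= THRESHOLD

print(condition_is_true({"grade": 82}))  # True
print(condition_is_true({"grade": 60}))  # False
</pre></devsite-code> <p>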
For example, 75 is the threshold value in the following condition:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Text only">grade >= 75</pre></devsite-code> <aside class="note"> This form of the term <b>threshold</b> is different than <a href="#classification_threshold"><b>classification threshold</b></a>. </aside> <p><a class="glossary-anchor" name="time_series_analysis"></a> <h2 class="hide-from-toc" id="time-series-analysis" data-text=" time series analysis" tabindex="-1"> time series analysis</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Clustering">#clustering</div> </div></p> <p>A subfield of machine learning and statistics that analyzes <a href="#temporal_data"><strong>temporal data</strong></a>. Many types of machine learning problems require time series analysis, including classification, clustering, forecasting, and anomaly detection. For example, you could use time series analysis to forecast the future sales of winter coats by month based on historical sales data.</p> <p><a class="glossary-anchor" name="timestep"></a> <h2 class="hide-from-toc" id="timestep" data-text="timestep" tabindex="-1">timestep</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Sequence Models">#seq</div> </div></p> <p>One &quot;unrolled&quot; cell within a <a href="#recurrent_neural_network"><strong>recurrent neural network</strong></a>. For example, the following figure shows three timesteps (labeled with the subscripts t-1, t, and t+1):</p> <p> <img src="/static/machine-learning/glossary/images/Simple_RNN.svg" loading="lazy" alt="Three timesteps in a recurrent neural network. The output of the first timestep becomes input to the second timestep. The output of the second timestep becomes input to the third timestep." > </p> <p><a class="glossary-anchor" name="token"></a> <h2 class="hide-from-toc" id="token" data-text=" token" tabindex="-1"> token</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>In a <a href="#language-model"><strong>language model</strong></a>, the atomic unit that the model is training on and making predictions on. A token is typically one of the following:</p> <ul> <li>a word—for example, the phrase &quot;dogs like cats&quot; consists of three word tokens: &quot;dogs&quot;, &quot;like&quot;, and &quot;cats&quot;.</li> <li>a character—for example, the phrase &quot;bike fish&quot; consists of nine character tokens. (Note that the blank space counts as one of the tokens.)</li> <li>subwords—in which a single word can be a single token or multiple tokens. A subword consists of a root word, a prefix, or a suffix. For example, a language model that uses subwords as tokens might view the word &quot;dogs&quot; as two tokens (the root word &quot;dog&quot; and the plural suffix &quot;s&quot;). That same language model might view the single word &quot;taller&quot; as two subwords (the root word &quot;tall&quot; and the suffix &quot;er&quot;).</li> </ul> <p>In domains outside of language models, tokens can represent other kinds of atomic units. For example, in computer vision, a token might be a subset of an image.</p> <p><a class="glossary-anchor" name="tower"></a> <h2 class="hide-from-toc" id="tower" data-text="tower" tabindex="-1">tower</h2></p> <p>A component of a <a href="#deep_neural_network"><strong>deep neural network</strong></a> that is itself a deep neural network. 
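</p> <p>For example, here is a minimal tf.keras sketch (the input sizes and layer widths are arbitrary, and the two-tower layout is just one possible arrangement) of a model whose two towers are each small deep neural networks combined in a final layer:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded>
import tensorflow as tf

# Two towers, each a small deep neural network over its own input.
user_input = tf.keras.Input(shape=(8,), name="user_features")
item_input = tf.keras.Input(shape=(12,), name="item_features")

user_tower = tf.keras.layers.Dense(16, activation="relu")(user_input)
user_tower = tf.keras.layers.Dense(8, activation="relu")(user_tower)

item_tower = tf.keras.layers.Dense(16, activation="relu")(item_input)
item_tower = tf.keras.layers.Dense(8, activation="relu")(item_tower)

# The towers stay independent until a final combining layer.
combined = tf.keras.layers.Concatenate()([user_tower, item_tower])
output = tf.keras.layers.Dense(1, activation="sigmoid")(combined)

model = tf.keras.Model(inputs=[user_input, item_input], outputs=output)
model.summary()
</pre></devsite-code> <p>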
In some cases, each tower reads from an independent data source, and those towers stay independent until their output is combined in a final layer. In other cases, (for example, in the <a href="#encoder"><strong>encoder</strong></a> and <a href="#decoder"><strong>decoder</strong></a> tower of many <a href="#Transformer"><strong>Transformers</strong></a>), towers have cross-connections to each other.</p> <p><a class="glossary-anchor" name="TPUabbrev"></a> <h2 class="hide-from-toc" id="tpu" data-text=" TPU " tabindex="-1"> TPU </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> <div class="glossary-icon" title="Google Cloud">#GoogleCloud</div> </div></p> <p>Abbreviation for <a href="#TPU"><strong>Tensor Processing Unit</strong></a>.</p> <p><a class="glossary-anchor" name="TPU_chip"></a> <h2 class="hide-from-toc" id="tpu-chip" data-text=" TPU chip " tabindex="-1"> TPU chip </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> <div class="glossary-icon" title="Google Cloud">#GoogleCloud</div> </div></p> <p>A programmable linear algebra accelerator with on-chip high bandwidth memory that is optimized for machine learning workloads. Multiple TPU chips are deployed on a <a href="#TPU_device"><strong>TPU device</strong></a>.</p> <p><a class="glossary-anchor" name="TPU_device"></a> <h2 class="hide-from-toc" id="tpu-device" data-text=" TPU device " tabindex="-1"> TPU device </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> <div class="glossary-icon" title="Google Cloud">#GoogleCloud</div> </div></p> <p>A printed circuit board (PCB) with multiple <a href="#TPU_chip"><strong>TPU chips</strong></a>, high bandwidth network interfaces, and system cooling hardware.</p> <p><a class="glossary-anchor" name="TPU_master"></a> <h2 class="hide-from-toc" id="tpu-master" data-text=" TPU master" tabindex="-1"> TPU master</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> <div class="glossary-icon" title="Google Cloud">#GoogleCloud</div> </div></p> <p>The central coordination process running on a host machine that sends and receives data, results, programs, performance, and system health information to the <a href="#TPU_worker"><strong>TPU workers</strong></a>. The TPU master also manages the setup and shutdown of <a href="#TPU_device"><strong>TPU devices</strong></a>.</p> <p><a class="glossary-anchor" name="TPU_node"></a> <h2 class="hide-from-toc" id="tpu-node" data-text=" TPU node " tabindex="-1"> TPU node </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> <div class="glossary-icon" title="Google Cloud">#GoogleCloud</div> </div></p> <p>A TPU resource on Google Cloud with a specific <a href="#TPU_type"><strong>TPU type</strong></a>. The TPU node connects to your <a href="https://cloud.google.com/vpc/docs/">VPC Network</a> from a <a href="https://cloud.google.com/vpc/docs/vpc-peering">peer VPC network</a>. 
TPU nodes are a resource defined in the <a href="https://cloud.google.com/tpu/docs/reference/rest/v1/projects.locations.nodes">Cloud TPU API</a>.</p> <p><a class="glossary-anchor" name="TPU_Pod"></a> <h2 class="hide-from-toc" id="tpu-pod" data-text=" TPU Pod " tabindex="-1"> TPU Pod </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> <div class="glossary-icon" title="Google Cloud">#GoogleCloud</div> </div></p> <p>A specific configuration of <a href="#TPU_device"><strong>TPU devices</strong></a> in a Google data center. All of the devices in a TPU Pod are connected to one another over a dedicated high-speed network. A TPU Pod is the largest configuration of <a href="#TPU_device"><strong>TPU devices</strong></a> available for a specific TPU version.</p> <p><a class="glossary-anchor" name="TPU_resource"></a> <h2 class="hide-from-toc" id="tpu-resource" data-text=" TPU resource" tabindex="-1"> TPU resource</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> <div class="glossary-icon" title="Google Cloud">#GoogleCloud</div> </div></p> <p>A TPU entity on Google Cloud that you create, manage, or consume. For example, <a href="#TPU_node"><strong>TPU nodes</strong></a> and <a href="#TPU_type"><strong>TPU types</strong></a> are TPU resources.</p> <p><a class="glossary-anchor" name="TPU_slice"></a> <h2 class="hide-from-toc" id="tpu-slice" data-text=" TPU slice" tabindex="-1"> TPU slice</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> <div class="glossary-icon" title="Google Cloud">#GoogleCloud</div> </div></p> <p>A TPU slice is a fractional portion of the <a href="#TPU_device"><strong>TPU devices</strong></a> in a <a href="#TPU_Pod"><strong>TPU Pod</strong></a>. All of the devices in a TPU slice are connected to one another over a dedicated high-speed network.</p> <p><a class="glossary-anchor" name="TPU_type"></a> <h2 class="hide-from-toc" id="tpu-type" data-text=" TPU type " tabindex="-1"> TPU type </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> <div class="glossary-icon" title="Google Cloud">#GoogleCloud</div> </div></p> <p>A configuration of one or more <a href="#TPU_device"><strong>TPU devices</strong></a> with a specific TPU hardware version. You select a TPU type when you create a <a href="#TPU_node"><strong>TPU node</strong></a> on Google Cloud. For example, a <code translate="no" dir="ltr">v2-8</code> TPU type is a single TPU v2 device with 8 cores. A <code translate="no" dir="ltr">v3-2048</code> TPU type has 256 networked TPU v3 devices and a total of 2048 cores. 
TPU types are a resource defined in the <a href="https://cloud.google.com/tpu/docs/reference/rest/v1/projects.locations.acceleratorTypes">Cloud TPU API</a>.</p> <p><a class="glossary-anchor" name="TPU_worker"></a> <h2 class="hide-from-toc" id="tpu-worker" data-text=" TPU worker" tabindex="-1"> TPU worker</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="TensorFlow">#TensorFlow</div> <div class="glossary-icon" title="Google Cloud">#GoogleCloud</div> </div></p> <p>A process that runs on a host machine and executes machine learning programs on <a href="#TPU_device"><strong>TPU devices</strong></a>.</p> <p><a class="glossary-anchor" name="training"></a> <h2 class="hide-from-toc" id="training" data-text=" training" tabindex="-1"> training</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The process of determining the ideal <a href="#parameter"><strong>parameters</strong></a> (weights and biases) comprising a <a href="#model"><strong>model</strong></a>. During training, a system reads in <a href="#example"><strong>examples</strong></a> and gradually adjusts parameters. Training uses each example anywhere from a few times to billions of times.</p> <p><a class="glossary-anchor" name="training-loss"></a> <h2 class="hide-from-toc" id="training-loss" data-text=" training loss" tabindex="-1"> training loss</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A <a href="#metric"><strong>metric</strong></a> representing a model&#39;s <a href="#loss"><strong>loss</strong></a> during a particular training iteration. For example, suppose the loss function is <a href="#MSE"><strong>Mean Squared Error</strong></a>. Perhaps the training loss (the Mean Squared Error) for the 10th iteration is 2.2, and the training loss for the 100th iteration is 1.9.</p> <p>A <a href="#loss_curve"><strong>loss curve</strong></a> plots training loss versus the number of iterations. A loss curve provides the following hints about training:</p> <ul> <li>A downward slope implies that the model is improving.</li> <li>An upward slope implies that the model is getting worse.</li> <li>A flat slope implies that the model has reached <a href="#convergence"><strong>convergence</strong></a>.</li> </ul> <p>For example, the following somewhat idealized <a href="#loss_curve"><strong>loss curve</strong></a> shows:</p> <ul> <li>A steep downward slope during the initial iterations, which implies rapid model improvement.</li> <li>A gradually flattening (but still downward) slope until close to the end of training, which implies continued model improvement at a somewhat slower pace than during the initial iterations.</li> <li>A flat slope towards the end of training, which suggests convergence.</li> </ul> <p> <img src="/static/machine-learning/glossary/images/TrainingLoss.png" loading="lazy" alt="The plot of training loss versus iterations. This loss curve starts with a steep downward slope. The slope gradually flattens until the slope becomes zero."
> </p> <p>Although training loss is important, see also <a href="#generalization"><strong>generalization</strong></a>.</p> <p><a class="glossary-anchor" name="training-serving-skew"></a> <h2 class="hide-from-toc" id="training-serving-skew" data-text=" training-serving skew" tabindex="-1"> training-serving skew</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The difference between a model&#39;s performance during <a href="#training"><strong>training</strong></a> and that same model&#39;s performance during <a href="#serving"><strong>serving</strong></a>.</p> <p><a class="glossary-anchor" name="training_set"></a> <h2 class="hide-from-toc" id="training-set" data-text=" training set" tabindex="-1"> training set</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The subset of the <a href="#dataset"><strong>dataset</strong></a> used to train a <a href="#model"><strong>model</strong></a>.</p> <p>Traditionally, examples in the dataset are divided into the following three distinct subsets:</p> <ul> <li>a training set</li> <li>a <a href="#validation_set"><strong>validation set</strong></a></li> <li>a <a href="#test_set"><strong>test set</strong></a></li> </ul> <p>Ideally, each example in the dataset should belong to only one of the preceding subsets. For example, a single example shouldn&#39;t belong to both the training set and the validation set.</p> <p><a class="glossary-anchor" name="trajectory"></a> <h2 class="hide-from-toc" id="trajectory" data-text=" trajectory" tabindex="-1"> trajectory</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Reinforcement Learning">#rl</div> </div></p> <p>In <a href="#reinforcement_learning"><strong>reinforcement learning</strong></a>, a sequence of <a href="https://wikipedia.org/wiki/Tuple" target="T">tuples</a> that represent a sequence of <a href="#state"><strong>state</strong></a> transitions of the <a href="#agent"><strong>agent</strong></a>, where each tuple corresponds to the state, <a href="#action"><strong>action</strong></a>, <a href="#reward"><strong>reward</strong></a>, and next state for a given state transition.</p> <p><a class="glossary-anchor" name="transfer_learning"></a> <h2 class="hide-from-toc" id="transfer-learning" data-text=" transfer learning" tabindex="-1"> transfer learning</h2></p> <p>Transferring information from one machine learning task to another. For example, in multi-task learning, a single model solves multiple tasks, such as a <a href="#deep_model"><strong>deep model</strong></a> that has different output nodes for different tasks. Transfer learning might involve transferring knowledge from the solution of a simpler task to a more complex one, or involve transferring knowledge from a task where there is more data to one where there is less data.</p> <p>Most machine learning systems solve a <em>single</em> task. 
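</p> <p>One common concrete form of transfer learning is reusing a model pretrained on a data-rich task as the starting point for a task with less data. The following tf.keras sketch (MobileNetV2 and a binary-classification head are arbitrary choices for illustration) freezes the pretrained layers and trains only a new head:</p> <div></div><devsite-code><pre translate="no" dir="ltr" is-upgraded>
import tensorflow as tf

# A base model pretrained on ImageNet classification (the data-rich task).
base = tf.keras.applications.MobileNetV2(
    input_shape=(160, 160, 3), include_top=False, weights="imagenet")
base.trainable = False  # Freeze the transferred knowledge.

# A new head for the target task (here, a hypothetical binary classifier).
model = tf.keras.Sequential([
    base,
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
# model.fit(target_task_dataset, ...)  # Train only the new head on the smaller dataset.
</pre></devsite-code> <p>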
Transfer learning is a baby step towards artificial intelligence in which a single program can solve <em>multiple</em> tasks.</p> <p><a class="glossary-anchor" name="Transformer"></a> <a class="glossary-anchor" name="transformer"></a> <h2 class="hide-from-toc" id="transformer" data-text=" Transformer" tabindex="-1"> Transformer</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A <a href="#neural_network"><strong>neural network</strong></a> architecture developed at Google that relies on <a href="#self-attention"><strong>self-attention</strong></a> mechanisms to transform a sequence of input embeddings into a sequence of output embeddings without relying on <a href="#convolution"><strong>convolutions</strong></a> or <a href="#recurrent_neural_network"><strong>recurrent neural networks</strong></a>. A Transformer can be viewed as a stack of self-attention layers.</p> <p>A Transformer can include any of the following:</p> <ul> <li>an <a href="#encoder"><strong>encoder</strong></a></li> <li>a <a href="#decoder"><strong>decoder</strong></a></li> <li>both an encoder and decoder</li> </ul> <p>An <strong>encoder</strong> transforms a sequence of embeddings into a new sequence of the same length. An encoder includes N identical layers, each of which contains two sub-layers. These two sub-layers are applied at each position of the input embedding sequence, transforming each element of the sequence into a new embedding. The first encoder sub-layer aggregates information from across the input sequence. The second encoder sub-layer transforms the aggregated information into an output embedding.</p> <p>A <strong>decoder</strong> transforms a sequence of input embeddings into a sequence of output embeddings, possibly with a different length. A decoder also includes N identical layers with three sub-layers, two of which are similar to the encoder sub-layers. The third decoder sub-layer takes the output of the encoder and applies the <a href="#self-attention"><strong>self-attention</strong></a> mechanism to gather information from it.</p> <p>The blog post <a href="https://ai.googleblog.com/2017/08/transformer-novel-neural-network.html">Transformer: A Novel Neural Network Architecture for Language Understanding</a> provides a good introduction to Transformers.</p> <p><a class="glossary-anchor" name="translational_invariance"></a> <h2 class="hide-from-toc" id="translational-invariance" data-text=" translational invariance" tabindex="-1"> translational invariance</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Image Models">#image</div> </div></p> <p>In an image classification problem, an algorithm&#39;s ability to successfully classify images even when the position of objects within the image changes. 
For example, the algorithm can still identify a dog, whether it is in the center of the frame or at the left end of the frame.</p> <p>See also <a href="#size_invariance"><strong>size invariance</strong></a> and <a href="#rotational_invariance"><strong>rotational invariance</strong></a>.</p> <p><a class="glossary-anchor" name="trigram"></a> <h2 class="hide-from-toc" id="trigram" data-text=" trigram" tabindex="-1"> trigram</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Sequence Models">#seq</div> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>An <a href="#N-gram"><strong>N-gram</strong></a> in which N=3.</p> <p><a class="glossary-anchor" name="TN"></a> <h2 class="hide-from-toc" id="true-negative-tn" data-text=" true negative (TN)" tabindex="-1"> true negative (TN)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>An example in which the model <em>correctly</em> predicts the <a href="#negative_class"><strong>negative class</strong></a>. For example, the model infers that a particular email message is <em>not spam</em>, and that email message really is <em>not spam</em>.</p> <p><a class="glossary-anchor" name="TP"></a> <h2 class="hide-from-toc" id="true-positive-tp" data-text=" true positive (TP)" tabindex="-1"> true positive (TP)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>An example in which the model <em>correctly</em> predicts the <a href="#positive_class"><strong>positive class</strong></a>. For example, the model infers that a particular email message is spam, and that email message really is spam.</p> <p><a class="glossary-anchor" name="TP_rate"></a> <h2 class="hide-from-toc" id="true-positive-rate-tpr" data-text=" true positive rate (TPR)" tabindex="-1"> true positive rate (TPR)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Synonym for <a href="#recall"><strong>recall</strong></a>. That is:</p> <div> $$\text{true positive rate} = \frac{\text{true positives}} {\text{true positives} + \text{false negatives}}$$ </div> <p>True positive rate is the y-axis in an <a href="#ROC"><strong>ROC curve</strong></a>.</p> <p><a class="glossary-anchor" name="u"></a> <h2 class="glossary" id="u" data-text="U" tabindex="-1">U</h2></p> <p><a class="glossary-anchor" name="unawareness"></a> <h2 class="hide-from-toc" id="unawareness-to-a-sensitive-attribute" data-text=" unawareness (to a sensitive attribute)" tabindex="-1"> unawareness (to a sensitive attribute)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Fairness">#fairness</div> </div></p> <p>A situation in which <a href="#sensitive_attribute"><strong>sensitive attributes</strong></a> are present, but not included in the training data. 
Because sensitive attributes are often correlated with other attributes of one&#39;s data, a model trained with unawareness about a sensitive attribute could still have <a href="#disparate_impact"><strong>disparate impact</strong></a> with respect to that attribute, or violate other <a href="#fairness_constraint"><strong>fairness constraints</strong></a>.</p> <p><a class="glossary-anchor" name="underfitting"></a> <h2 class="hide-from-toc" id="underfitting" data-text=" underfitting" tabindex="-1"> underfitting</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Producing a <a href="#model"><strong>model</strong></a> with poor predictive ability because the model hasn&#39;t fully captured the complexity of the training data. Many problems can cause underfitting, including:</p> <ul> <li>Training on the wrong set of <a href="#feature"><strong>features</strong></a>.</li> <li>Training for too few <a href="#epoch"><strong>epochs</strong></a> or at too low a <a href="#learning_rate"><strong>learning rate</strong></a>.</li> <li>Training with too high a <a href="#regularization_rate"><strong>regularization rate</strong></a>.</li> <li>Providing too few <a href="#hidden_layer"><strong>hidden layers</strong></a> in a deep neural network.</li> </ul> <p><a class="glossary-anchor" name="undersampling"></a> <h2 class="hide-from-toc" id="undersampling" data-text=" undersampling" tabindex="-1"> undersampling</h2></p> <p>Removing <a href="#example"><strong>examples</strong></a> from the <a href="#majority_class"><strong>majority class</strong></a> in a <a href="#class_imbalanced_data_set"><strong>class-imbalanced dataset</strong></a> in order to create a more balanced <a href="#training_set"><strong>training set</strong></a>.</p> <p>For example, consider a dataset in which the ratio of the majority class to the <a href="#minority_class"><strong>minority class</strong></a> is 20:1. To overcome this class imbalance, you could create a training set consisting of <em>all</em> of the minority class examples but only a <em>tenth</em> of the majority class examples, which would create a training-set class ratio of 2:1. Thanks to undersampling, this more balanced training set might produce a better model. Alternatively, this more balanced training set might contain insufficient examples to train an effective model.</p> <p>Contrast with <a href="#oversampling"><strong>oversampling</strong></a>.</p>
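<p>Here is a minimal sketch of that 20:1 example (plain Python with hypothetical data, not code from this glossary), keeping every minority-class example and a random tenth of the majority class:</p> <devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr">
import random

def undersample(majority, minority, keep_fraction=0.1, seed=42):
    """Keep all minority-class examples and a random fraction of the
    majority class to build a more balanced training set."""
    rng = random.Random(seed)
    kept_majority = rng.sample(majority, int(len(majority) * keep_fraction))
    return kept_majority + minority

# Hypothetical class-imbalanced dataset with a 20:1 ratio.
majority_examples = [{"label": 0}] * 20000
minority_examples = [{"label": 1}] * 1000

training_set = undersample(majority_examples, minority_examples)
print(len(training_set))  # 3000 examples, a 2:1 class ratio
</pre></devsite-code>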
<p><a class="glossary-anchor" name="unidirectional"></a> <h2 class="hide-from-toc" id="unidirectional" data-text=" unidirectional" tabindex="-1"> unidirectional</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A system that only evaluates the text that <em>precedes</em> a target section of text. In contrast, a bidirectional system evaluates both the text that <em>precedes</em> and <em>follows</em> a target section of text. See <a href="#bidirectional"><strong>bidirectional</strong></a> for more details.</p> <p><a class="glossary-anchor" name="unidirectional-language-model"></a> <h2 class="hide-from-toc" id="unidirectional-language-model" data-text=" unidirectional language model" tabindex="-1"> unidirectional language model</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A <a href="#language-model"><strong>language model</strong></a> that bases its probabilities only on the <a href="#token"><strong>tokens</strong></a> appearing <em>before</em>, not <em>after</em>, the target token(s). Contrast with <a href="#bidirectional-language-model"><strong>bidirectional language model</strong></a>.</p> <p><a class="glossary-anchor" name="unlabeled_example"></a> <h2 class="hide-from-toc" id="unlabeled-example" data-text=" unlabeled example" tabindex="-1"> unlabeled example</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>An example that contains <a href="#feature"><strong>features</strong></a> but no <a href="#label"><strong>label</strong></a>. For example, the following table shows three unlabeled examples from a house valuation model, each with three features but no house value:</p> <table> <tr><th>Number of bedrooms</th> <th>Number of bathrooms</th> <th>House age</th> </tr> <tr><td>3</td> <td>2</td> <td>15</td> </tr> <tr><td>2</td> <td>1</td> <td>72</td> </tr> <tr><td>4</td> <td>2</td> <td>34</td> </tr> </table> <p>In <a href="#supervised_machine_learning"><strong>supervised machine learning</strong></a>, models train on labeled examples and make predictions on <a href="#unlabeled_example"><strong>unlabeled examples</strong></a>.</p> <p>In <a href="#semi-supervised_learning"><strong>semi-supervised</strong></a> and <a href="#unsupervised_machine_learning"><strong>unsupervised</strong></a> learning, unlabeled examples are used during training.</p> <p>Contrast unlabeled example with <a href="#labeled_example"><strong>labeled example</strong></a>.</p> <p><a class="glossary-anchor" name="unsupervised_machine_learning"></a> <h2 class="hide-from-toc" id="unsupervised-machine-learning" data-text=" unsupervised machine learning" tabindex="-1"> unsupervised machine learning</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Clustering">#clustering</div> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>Training a <a href="#model"><strong>model</strong></a> to find patterns in a dataset, typically an unlabeled dataset.</p> <p>The most common use of unsupervised machine learning is to <a href="#clustering"><strong>cluster</strong></a> data into groups of similar examples. For example, an unsupervised machine learning algorithm can cluster songs based on various properties of the music. The resulting clusters can become an input to other machine learning algorithms (for example, to a music recommendation service). Clustering can help when useful labels are scarce or absent. For example, in domains such as anti-abuse and fraud, clusters can help humans better understand the data.</p> <p>Contrast with <a href="#supervised_machine_learning"><strong>supervised machine learning</strong></a>.</p>
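<p>For example, here is a minimal sketch of clustering unlabeled examples (it assumes scikit-learn and two made-up song properties, tempo and energy; it is an illustration, not code from this glossary):</p> <devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr">
import numpy as np
from sklearn.cluster import KMeans

# Hypothetical unlabeled examples: one row per song with two made-up
# properties (tempo in beats per minute, energy from 0 to 1) and no labels.
rng = np.random.default_rng(0)
songs = np.vstack([
    rng.normal(loc=[90.0, 0.3], scale=[5.0, 0.05], size=(50, 2)),
    rng.normal(loc=[128.0, 0.8], scale=[5.0, 0.05], size=(50, 2)),
])

# Group the songs into two clusters of similar examples.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
cluster_ids = kmeans.fit_predict(songs)
print(cluster_ids[:5], cluster_ids[-5:])
</pre></devsite-code> <p>The resulting cluster IDs could then become an input to another system, such as a music recommendation service.</p>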
<p>Another example of unsupervised machine learning is <a href="https://wikipedia.org/wiki/Principal_component_analysis" target="T">principal component analysis (PCA)</a>. For example, applying PCA on a dataset containing the contents of millions of shopping carts might reveal that shopping carts containing lemons frequently also contain antacids.</p> <p><a class="glossary-anchor" name="uplift-modeling"></a> <h2 class="hide-from-toc" id="uplift-modeling" data-text=" uplift modeling" tabindex="-1"> uplift modeling</h2></p> <p>A modeling technique, commonly used in marketing, that models the &quot;causal effect&quot; (also known as the &quot;incremental impact&quot;) of a &quot;treatment&quot; on an &quot;individual.&quot; Here are two examples:</p> <ul> <li>Doctors might use uplift modeling to predict the mortality decrease (causal effect) of a medical procedure (treatment) depending on the age and medical history of a patient (individual).</li> <li>Marketers might use uplift modeling to predict the increase in probability of a purchase (causal effect) due to an advertisement (treatment) on a person (individual).</li> </ul> <p>Uplift modeling differs from <a href="#classification_model"><strong>classification</strong></a> or <a href="#regression_model"><strong>regression</strong></a> in that some labels (for example, half of the labels in binary treatments) are always missing in uplift modeling. For example, a patient can either receive or not receive a treatment; therefore, we can observe whether the patient heals in only one of these two situations (but never both). The main advantage of an uplift model is that it can generate predictions for the unobserved situation (the counterfactual) and use it to compute the causal effect.</p> <p><a class="glossary-anchor" name="upweighting"></a> <h2 class="hide-from-toc" id="upweighting" data-text=" upweighting" tabindex="-1"> upweighting</h2></p> <p>Applying a weight to the <a href="#downsampling"><strong>downsampled</strong></a> class equal to the factor by which you downsampled.</p> <p><a class="glossary-anchor" name="user_matrix"></a> <h2 class="hide-from-toc" id="user-matrix" data-text=" user matrix" tabindex="-1"> user matrix</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Recommendation Systems">#recsystems</div> </div></p> <p>In <a href="#recommendation_system"><strong>recommendation systems</strong></a>, an <a href="#embedding_vector"><strong>embedding vector</strong></a> generated by <a href="#matrix_factorization"><strong>matrix factorization</strong></a> that holds latent signals about user preferences. Each row of the user matrix holds information about the relative strength of various latent signals for a single user. For example, consider a movie recommendation system. In this system, the latent signals in the user matrix might represent each user&#39;s interest in particular genres, or might be harder-to-interpret signals that involve complex interactions across multiple factors.</p> <p>The user matrix has a column for each latent feature and a row for each user. That is, the user matrix has the same number of rows as the target matrix that is being factorized.
For example, given a movie recommendation system for 1,000,000 users, the user matrix will have 1,000,000 rows.</p> <p><a class="glossary-anchor" name="v"></a> <h2 class="glossary" id="v" data-text="V" tabindex="-1">V</h2></p> <p><a class="glossary-anchor" name="validation"></a> <h2 class="hide-from-toc" id="validation" data-text=" validation" tabindex="-1"> validation</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The initial evaluation of a model&#39;s quality. Validation checks the quality of a model&#39;s predictions against the <a href="#validation_set"><strong>validation set</strong></a>.</p> <p>Because the validation set differs from the <a href="#training_set"><strong>training set</strong></a>, validation helps guard against <a href="#overfitting"><strong>overfitting</strong></a>.</p> <p>You might think of evaluating the model against the validation set as the first round of testing and evaluating the model against the <a href="#test_set"><strong>test set</strong></a> as the second round of testing.</p> <p><a class="glossary-anchor" name="validation-loss"></a> <h2 class="hide-from-toc" id="validation-loss" data-text=" validation loss" tabindex="-1"> validation loss</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A <a href="#metric"><strong>metric</strong></a> representing a model&#39;s <a href="#loss"><strong>loss</strong></a> on the <a href="#validation_set"><strong>validation set</strong></a> during a particular <a href="#iteration"><strong>iteration</strong></a> of training.</p> <p>See also <a href="#generalization_curve"><strong>generalization curve</strong></a>.</p> <p><a class="glossary-anchor" name="validation_set"></a> <h2 class="hide-from-toc" id="validation-set" data-text=" validation set" tabindex="-1"> validation set</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The subset of the <a href="#dataset"><strong>dataset</strong></a> used for the initial evaluation of a trained <a href="#model"><strong>model</strong></a>. Typically, you evaluate the trained model against the <a href="#validation_set"><strong>validation set</strong></a> several times before evaluating the model against the <a href="#test_set"><strong>test set</strong></a>.</p> <p>Traditionally, you divide the examples in the dataset into the following three distinct subsets:</p> <ul> <li>a <a href="#training_set"><strong>training set</strong></a></li> <li>a validation set</li> <li>a <a href="#test_set"><strong>test set</strong></a></li> </ul> <p>Ideally, each example in the dataset should belong to only one of the preceding subsets. For example, a single example shouldn&#39;t belong to both the training set and the validation set.</p> <p><a class="glossary-anchor" name="value-imputation"></a> <h2 class="hide-from-toc" id="value-imputation" data-text="value imputation" tabindex="-1">value imputation</h2></p> <p>The process of replacing a missing value with an acceptable substitute. When a value is missing, you can either discard the entire example or you can use value imputation to salvage the example.</p> <p>For example, consider a dataset containing a <code translate="no" dir="ltr">temperature</code> feature that is supposed to be recorded every hour. However, the temperature reading was unavailable for a particular hour. Here is a section of the dataset:</p> <table> <tr><th>Timestamp</th> <th>Temperature</th></tr> <tr><td>1680561000</td> <td>10</td></tr> <tr><td>1680564600</td> <td>12</td></tr> <tr><td>1680568200</td> <td>missing</td></tr> <tr><td>1680571800</td> <td>20</td></tr> <tr><td>1680575400</td> <td>21</td></tr> <tr><td>1680579000</td> <td>21</td></tr> </table> <p>A system could either delete the missing example or impute the missing temperature as 12, 16, 18, or 20, depending on the imputation algorithm.</p>
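<p>As a minimal sketch (assuming pandas; linear interpolation is just one of the imputation algorithms mentioned above), the missing reading could be imputed like this:</p> <devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr">
import pandas as pd

# The hourly temperature readings from the preceding table; None marks
# the missing value.
temperatures = pd.Series([10, 12, None, 20, 21, 21])

# Linear interpolation between the neighboring readings imputes 16.0.
imputed = temperatures.interpolate()
print(imputed.tolist())  # [10.0, 12.0, 16.0, 20.0, 21.0, 21.0]
</pre></devsite-code>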
<p><a class="glossary-anchor" name="vanishing_gradient_problem"></a> <h2 class="hide-from-toc" id="vanishing-gradient-problem" data-text="vanishing gradient problem" tabindex="-1">vanishing gradient problem</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Sequence Models">#seq</div> </div></p> <p>The tendency for the gradients of early <a href="#hidden_layer"><strong>hidden layers</strong></a> of some <a href="#deep_neural_network"><strong>deep neural networks</strong></a> to become surprisingly flat (low). Increasingly lower gradients result in increasingly smaller changes to the weights on nodes in a deep neural network, leading to little or no learning. Models suffering from the vanishing gradient problem become difficult or impossible to train. <a href="#Long_Short-Term_Memory"><strong>Long Short-Term Memory</strong></a> cells address this issue.</p> <p>Compare to <a href="#exploding_gradient_problem"><strong>exploding gradient problem</strong></a>.</p> <p><a class="glossary-anchor" name="variable-importances"></a> <h2 class="hide-from-toc" id="variable-importances" data-text=" variable importances " tabindex="-1"> variable importances </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>A set of scores that indicates the relative importance of each <a href="#feature"><strong>feature</strong></a> to the model.</p> <p>For example, consider a <a href="#decision-tree"><strong>decision tree</strong></a> that estimates house prices. Suppose this decision tree uses three features: size, age, and style. If the variable importances for the three features are calculated to be {size=5.8, age=2.5, style=4.7}, then size is more important to the decision tree than age or style.</p> <p>Different variable importance metrics exist, which can inform ML experts about different aspects of models.</p> <p><a class="glossary-anchor" name="variational-autoencoder"></a> <h2 class="hide-from-toc" id="variational-autoencoder-vae" data-text=" variational autoencoder (VAE) " tabindex="-1"> variational autoencoder (VAE) </h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p>A type of <a href="#autoencoder"><strong>autoencoder</strong></a> that leverages the discrepancy between inputs and outputs to generate modified versions of the inputs. Variational autoencoders are useful for <a href="#generative-AI"><strong>generative AI</strong></a>.</p> <p>VAEs are based on variational inference: a technique for estimating the parameters of a probability model.</p> <p><a class="glossary-anchor" name="vector"></a> <h2 class="hide-from-toc" id="vector" data-text=" vector " tabindex="-1"> vector </h2></p> <p>Very overloaded term whose meaning varies across different mathematical and scientific fields.
Within machine learning, a vector has two properties:</p> <ul> <li>Data type: Vectors in machine learning usually hold floating-point numbers.</li> <li>Number of elements: This is the vector&#39;s length or its <em>dimension</em>.</li> </ul> <p>For example, consider a <a href="#feature_vector"><strong>feature vector</strong></a> that holds eight floating-point numbers. This feature vector has a length or dimension of eight. Note that machine learning vectors often have a huge number of dimensions.</p> <p>You can represent many different kinds of information as a vector. For example:</p> <ul> <li>Any position on the surface of Earth can be represented as a 2-dimensional vector, where one dimension is the latitude and the other is the longitude.</li> <li>The current prices of each of 500 stocks can be represented as a 500-dimensional vector.</li> <li>A probability distribution over a finite number of classes can be represented as a vector. For example, a <a href="#multi-class"><strong>multiclass classification</strong></a> system that predicts one of three output colors (red, green, or yellow) could output the vector <code translate="no" dir="ltr">(0.3, 0.2, 0.5)</code> to mean <code translate="no" dir="ltr">P[red]=0.3, P[green]=0.2, P[yellow]=0.5</code>.</li> </ul> <p>Vectors can be concatenated; therefore, a variety of different media can be represented as a single vector. Some models operate directly on the concatenation of many <a href="#one-hot_encoding"><strong>one-hot encodings</strong></a>.</p> <p>Specialized processors such as <a href="#TPU"><strong>TPUs</strong></a> are optimized to perform mathematical operations on vectors.</p> <p>A vector is a <a href="#tensor"><strong>tensor</strong></a> of <a href="#rank_Tensor"><strong>rank</strong></a> 1.</p> <p><a class="glossary-anchor" name="w"></a> <h2 class="glossary" id="w" data-text="W" tabindex="-1">W</h2></p> <p><a class="glossary-anchor" name="Wasserstein_loss"></a> <h2 class="hide-from-toc" id="wasserstein-loss" data-text="Wasserstein loss" tabindex="-1">Wasserstein loss</h2></p> <p>One of the loss functions commonly used in <a href="#generative_adversarial_network"><strong>generative adversarial networks</strong></a>, based on the <a href="#earth-movers-distance"><strong>earth mover&#39;s distance</strong></a> between the distribution of generated data and real data.</p> <p><a class="glossary-anchor" name="weight"></a> <h2 class="hide-from-toc" id="weight" data-text=" weight" tabindex="-1"> weight</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A value that a model multiplies by another value. <a href="#training"><strong>Training</strong></a> is the process of determining a model&#39;s ideal weights; <a href="#inference"><strong>inference</strong></a> is the process of using those learned weights to make predictions.</p> <p>Imagine a <a href="#linear_model"><b>linear model</b></a> with two features. Suppose that training determines the following weights (and <a href="#bias">bias</a>):</p> <ul> <li>The bias, b, has a value of 2.2</li> <li>The weight, w<sub>1</sub> associated with one feature is 1.5.</li> <li>The weight, w<sub>2</sub> associated with the other feature is 0.4.</li> </ul> <p>Now imagine an <a href="#example">example</a> with the following feature values:</p> <ul> <li>The value of one feature, x<sub>1</sub>, is 6.</li> <li>The value of the other feature, x<sub>2</sub>, is 10.</li> </ul> <p>This linear model uses the following formula to generate a prediction, y':</p> <div> $$y' = b + w_1x_1 + w_2x_2$$ </div> <p>Therefore, the prediction is:</p> <div> $$y' = 2.2 + (1.5)(6) + (0.4)(10) = 15.2$$ </div> <p>If a weight is 0, then the corresponding feature doesn't contribute to the model. For example, if w<sub>1</sub> is 0, then the value of x<sub>1</sub> is irrelevant.</p>
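<p>The same arithmetic as a minimal Python sketch (the <code translate="no" dir="ltr">predict</code> helper is illustrative, not a library function):</p> <devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr">
def predict(bias, weights, features):
    """Linear model inference: y' = b + w1*x1 + w2*x2 + ..."""
    return bias + sum(w * x for w, x in zip(weights, features))

# Weights, bias, and feature values from the example above.
prediction = predict(bias=2.2, weights=[1.5, 0.4], features=[6, 10])
print(round(prediction, 4))  # 15.2
</pre></devsite-code>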
<p><a class="glossary-anchor" name="WALS"></a> <h2 class="hide-from-toc" id="weighted-alternating-least-squares-wals" data-text="Weighted Alternating Least Squares (WALS)" tabindex="-1">Weighted Alternating Least Squares (WALS)</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Recommendation Systems">#recsystems</div> </div></p> <p>An algorithm for minimizing the objective function during <a href="#matrix_factorization"><strong>matrix factorization</strong></a> in <a href="#recommendation_system"><strong>recommendation systems</strong></a>, which allows a downweighting of the missing examples. WALS minimizes the weighted squared error between the original matrix and the reconstruction by alternating between fixing the row factorization and column factorization. Each of these optimizations can be solved by least squares <a href="#convex_optimization"><strong>convex optimization</strong></a>. For details, see the <a href="/machine-learning/recommendation/collaborative/matrix" target="T" class="gc-analytics-event" data-category="launchRecommendationCourse" data-label="ml-glossary" data-action="click">Recommendation Systems course</a>.</p> <p><a class="glossary-anchor" name="weighted_sum"></a> <h2 class="hide-from-toc" id="weighted-sum" data-text=" weighted sum" tabindex="-1"> weighted sum</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>The sum of all the relevant input values multiplied by their corresponding weights. For example, suppose the relevant inputs consist of the following:</p> <table> <tr><th>Input value</th> <th>Input weight</th></tr> <tr><td>2</td> <td>-1.3</td></tr> <tr><td>-1</td> <td>0.6</td></tr> <tr><td>3</td> <td>0.4</td></tr> </table> <p>The weighted sum is therefore:</p> <div></div><devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr" is-upgraded syntax="Text only">weighted sum = (2)(-1.3) + (-1)(0.6) + (3)(0.4) = -2.0</pre></devsite-code> <p>A weighted sum is the input argument to an <a href="#activation_function"><strong>activation function</strong></a>.</p> <p><a class="glossary-anchor" name="wide_model"></a> <h2 class="hide-from-toc" id="wide-model" data-text=" wide model" tabindex="-1"> wide model</h2></p> <p>A linear model that typically has many <a href="#sparse_features"><strong>sparse input features</strong></a>. We refer to it as &quot;wide&quot; since such a model is a special type of <a href="#neural_network"><strong>neural network</strong></a> with a large number of inputs that connect directly to the output node.
Wide models are often easier to debug and inspect than <a href="#deep_model"><strong>deep models</strong></a>. Although wide models cannot express nonlinearities through <a href="#hidden_layer"><strong>hidden layers</strong></a>, wide models can use transformations such as <a href="#feature_cross"><strong>feature crossing</strong></a> and <a href="#bucketing"><strong>bucketization</strong></a> to model nonlinearities in different ways.</p> <p>Contrast with <a href="#deep_model"><strong>deep model</strong></a>.</p> <p><a class="glossary-anchor" name="width"></a> <h2 class="hide-from-toc" id="width" data-text=" width" tabindex="-1"> width</h2></p> <p>The number of <a href="#neuron"><strong>neurons</strong></a> in a particular <a href="#layer"><strong>layer</strong></a> of a <a href="#neural_network"><strong>neural network</strong></a>.</p> <p><a class="glossary-anchor" name="wisdom-of-the-crowd"></a> <h2 class="hide-from-toc" id="wisdom-of-the-crowd" data-text=" wisdom of the crowd" tabindex="-1"> wisdom of the crowd</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Decision Forests">#df</div> </div></p> <p>The idea that averaging the opinions or estimates of a large group of people (&quot;the crowd&quot;) often produces surprisingly good results. For example, consider a game in which people guess the number of jelly beans packed into a large jar. Although most individual guesses will be inaccurate, the average of all the guesses has been empirically shown to be surprisingly close to the actual number of jelly beans in the jar.</p> <p><a href="#ensemble"><strong>Ensembles</strong></a> are a software analog of wisdom of the crowd. Even if individual models make wildly inaccurate predictions, averaging the predictions of many models often generates surprisingly good predictions. For example, although an individual <a href="#decision-tree"><strong>decision tree</strong></a> might make poor predictions, a <a href="#decision-forest"><strong>decision forest</strong></a> often makes very good predictions.</p> <p><a class="glossary-anchor" name="word-embedding"></a> <h2 class="hide-from-toc" id="word-embedding" data-text=" word embedding" tabindex="-1"> word embedding</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> </div></p> <p><a href="#representation"><strong>Representing</strong></a> each word in a word set within an <a href="#embedding_vector"><strong>embedding vector</strong></a>; that is, representing each word as a vector of floating-point values between 0.0 and 1.0. Words with similar meanings have more-similar representations than words with different meanings. 
For example, <em>carrots</em>, <em>celery</em>, and <em>cucumbers</em> would all have relatively similar representations, which would be very different from the representations of <em>airplane</em>, <em>sunglasses</em>, and <em>toothpaste</em>.</p> <p><a class="glossary-anchor" name="x"></a> <h2 class="glossary" id="x" data-text="X" tabindex="-1">X</h2></p> <p><a class="glossary-anchor" name="XLA"></a> <h2 class="hide-from-toc" id="xla-accelerated-linear-algebra" data-text=" XLA (Accelerated Linear Algebra)" tabindex="-1"> XLA (Accelerated Linear Algebra)</h2></p> <p>An open-source machine learning compiler for GPUs, CPUs, and ML accelerators.</p> <p>The XLA compiler takes models from popular ML frameworks such as <a href="https://pytorch.org" target="T">PyTorch</a>, <a href="#TensorFlow"><strong>TensorFlow</strong></a>, and <a href="#JAX"><strong>JAX</strong></a>, and optimizes them for high-performance execution across different hardware platforms including GPUs, CPUs, and ML <a href="#accelerator-chip"><strong>accelerators</strong></a>.</p> <p><a class="glossary-anchor" name="z"></a> <h2 class="glossary" id="z" data-text="Z" tabindex="-1">Z</h2></p> <p><a class="glossary-anchor" name="zero-shot-learning"></a> <h2 class="hide-from-toc" id="zero-shot-learning" data-text=" zero-shot learning" tabindex="-1"> zero-shot learning</h2></p> <p>A type of machine learning <a href="#training"><strong>training</strong></a> where the <a href="#model"><strong>model</strong></a> infers a <a href="#prediction"><strong>prediction</strong></a> for a task that it was not specifically already trained on. In other words, the model is given zero task-specific training <a href="#example"><strong>examples</strong></a> but asked to do <a href="#inference"><strong>inference</strong></a> for that task.</p> <p><a class="glossary-anchor" name="zero-shot-prompting"></a> <h2 class="hide-from-toc" id="zero-shot-prompting" data-text=" zero-shot prompting" tabindex="-1"> zero-shot prompting</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="Language Evaluation">#language</div> <div class="glossary-icon" title="Generative AI">#generativeAI</div> </div></p> <p>A <a href="#prompt"><strong>prompt</strong></a> that does <em>not</em> provide an example of how you want the <a href="#large-language-model"><strong>large language model</strong></a> to respond. 
For example:</p> <table> <tr> <th>Parts of one prompt</th> <th>Notes</th> </tr> <tr> <td><tt>What is the official currency of the specified country?</tt></td> <td>The question you want the LLM to answer.</td> </tr> <tr> <td><tt>India:</tt></td> <td>The actual query.</td> </tr> </table> <p>The large language model might respond with any of the following:</p> <ul> <li>Rupee</li> <li>INR</li> <li>₹</li> <li>Indian rupee</li> <li>The rupee</li> <li>The Indian rupee</li> </ul> <p>All of the answers are correct, though you might prefer a particular format.</p> <p>Compare and contrast <strong>zero-shot prompting</strong> with the following terms:</p> <ul> <li><a href="#one-shot-prompting"><strong>one-shot prompting</strong></a></li> <li><a href="#few-shot-prompting"><strong>few-shot prompting</strong></a></li> </ul> <p><a class="glossary-anchor" name="Z-score-normalization"></a> <h2 class="hide-from-toc" id="z-score-normalization" data-text=" Z-score normalization" tabindex="-1"> Z-score normalization</h2> <div class="glossary-icon-container"> <div class="glossary-icon" title="ML Fundamentals">#fundamentals</div> </div></p> <p>A <a href="#scaling"><strong>scaling</strong></a> technique that replaces a raw <a href="#feature"><strong>feature</strong></a> value with a floating-point value representing the number of standard deviations from that feature&#39;s mean. For example, consider a feature whose mean is 800 and whose standard deviation is 100. The following table shows how Z-score normalization would map the raw value to its Z-score:</p> <table> <tr> <th>Raw value</th> <th>Z-score</th> </tr> <tr> <td>800</td> <td>0</td> </tr> <tr> <td>950</td> <td>+1.5</td> </tr> <tr> <td>575</td> <td>-2.25</td> </tr> </table> <p>The machine learning model then trains on the Z-scores for that feature instead of on the raw values.</p>
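<p>Here is a minimal sketch of the mapping in the preceding table (plain Python; the <code translate="no" dir="ltr">z_score</code> helper is illustrative):</p> <devsite-code><pre class="devsite-click-to-copy" translate="no" dir="ltr">
def z_score(raw_value, mean, stddev):
    """Number of standard deviations between a raw value and the feature's mean."""
    return (raw_value - mean) / stddev

# Feature with a mean of 800 and a standard deviation of 100.
for raw in (800, 950, 575):
    print(raw, z_score(raw, mean=800, stddev=100))
# 800 0.0
# 950 1.5
# 575 -2.25
</pre></devsite-code>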