CINXE.COM
And I Shall Call It Mini-Me GPT: Using Large Language Models to Classify the Uncharted Web – Sophos News
<!DOCTYPE html> <html lang="en-US"> <head> <meta charset="UTF-8"> <meta http-equiv="x-ua-compatible" content="ie=edge"> <meta name="viewport" content="width=device-width, initial-scale=1"> <link rel="profile" href="http://gmpg.org/xfn/11"> <link rel="pingback" href="https://news.sophos.com/xmlrpc.php"> <link rel="alternate" hreflang="es-419" href="https://news.sophos.com/es-419/2023/06/22/using-large-language-models-classify-uncharted-web" /> <link rel="alternate" hreflang="nl-nl" href="https://news.sophos.com/nl-nl/2023/06/22/using-large-language-models-classify-uncharted-web" /> <link rel="alternate" hreflang="pt-br" href="https://news.sophos.com/pt-br/2023/06/22/using-large-language-models-classify-uncharted-web" /> <link rel="alternate" hreflang="de-de" href="https://news.sophos.com/de-de/2023/06/22/using-large-language-models-classify-uncharted-web" /> <link rel="alternate" hreflang="en-us" href="https://news.sophos.com/en-us/2023/06/22/using-large-language-models-classify-uncharted-web" /> <link rel="alternate" hreflang="fr-fr" href="https://news.sophos.com/fr-fr/2023/06/22/using-large-language-models-classify-uncharted-web" /> <link rel="alternate" hreflang="es-es" href="https://news.sophos.com/es-es/2023/06/22/using-large-language-models-classify-uncharted-web" /> <link rel="alternate" hreflang="it-it" href="https://news.sophos.com/it-it/2023/06/22/using-large-language-models-classify-uncharted-web" /> <link rel="alternate" hreflang="ja-jp" href="https://news.sophos.com/ja-jp/2023/06/22/using-large-language-models-classify-uncharted-web" /> <link rel="alternate" hreflang="zh-tw" href="https://news.sophos.com/zh-tw/2023/06/22/using-large-language-models-classify-uncharted-web" /> <!-- Google Tag Manager --> <script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start': new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0], j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= 
'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f); })(window,document,'script','dataLayer','GTM-TW8W88B');</script> <!-- End Google Tag Manager --> <script type="text/javascript"> /* <![CDATA[ */ (()=>{var e={};e.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||new Function("return this")()}catch(e){if("object"==typeof window)return window}}(),function({ampUrl:n,isCustomizePreview:t,isAmpDevMode:r,noampQueryVarName:o,noampQueryVarValue:s,disabledStorageKey:i,mobileUserAgents:a,regexRegex:c}){if("undefined"==typeof sessionStorage)return;const d=new RegExp(c);if(!a.some((e=>{const n=e.match(d);return!(!n||!new RegExp(n[1],n[2]).test(navigator.userAgent))||navigator.userAgent.includes(e)})))return;e.g.addEventListener("DOMContentLoaded",(()=>{const e=document.getElementById("amp-mobile-version-switcher");if(!e)return;e.hidden=!1;const n=e.querySelector("a[href]");n&&n.addEventListener("click",(()=>{sessionStorage.removeItem(i)}))}));const g=r&&["paired-browsing-non-amp","paired-browsing-amp"].includes(window.name);if(sessionStorage.getItem(i)||t||g)return;const u=new URL(location.href),m=new URL(n);m.hash=u.hash,u.searchParams.has(o)&&s===u.searchParams.get(o)?sessionStorage.setItem(i,"1"):m.href!==u.href&&(window.stop(),location.replace(m.href))}({"ampUrl":"https:\/\/news.sophos.com\/en-us\/2023\/06\/22\/using-large-language-models-classify-uncharted-web\/?amp=1","noampQueryVarName":"noamp","noampQueryVarValue":"mobile","disabledStorageKey":"amp_mobile_redirect_disabled","mobileUserAgents":["Mobile","Android","Silk\/","Kindle","BlackBerry","Opera Mini","Opera Mobi"],"regexRegex":"^\\\/((?:.|\\n)+)\\\/([i]*)$","isCustomizePreview":false,"isAmpDevMode":false})})(); /* ]]> */ </script> <title>And I Shall Call It Mini-Me GPT: Using Large Language Models to Classify the Uncharted Web – Sophos News</title> <meta name='robots' content='max-image-preview:large' /> <style>img:is([sizes="auto" i], [sizes^="auto," 
i]) { contain-intrinsic-size: 3000px 1500px }</style> <!-- Jetpack Site Verification Tags --> <meta name="google-site-verification" content="8r1qg681OjOolfxmHEY1IYupmTBdyKXc-OPfpgeQHFk" /> <link rel='dns-prefetch' href='//unpkg.com' /> <link rel='dns-prefetch' href='//stats.wp.com' /> <link rel='dns-prefetch' href='//v0.wordpress.com' /> <link rel="alternate" type="application/rss+xml" title="Sophos News » Feed" href="https://news.sophos.com/feed/" /> <link rel="alternate" type="application/rss+xml" title="Sophos News » Comments Feed" href="https://news.sophos.com/comments/feed/" /> <script type="text/javascript"> /* <![CDATA[ */ window._wpemojiSettings = {"baseUrl":"https:\/\/s.w.org\/images\/core\/emoji\/15.0.3\/72x72\/","ext":".png","svgUrl":"https:\/\/s.w.org\/images\/core\/emoji\/15.0.3\/svg\/","svgExt":".svg","source":{"concatemoji":"https:\/\/news.sophos.com\/wp-includes\/js\/wp-emoji-release.min.js?ver=6.7.2"}}; /*! This file is auto-generated */ !function(i,n){var o,s,e;function c(e){try{var t={supportTests:e,timestamp:(new Date).valueOf()};sessionStorage.setItem(o,JSON.stringify(t))}catch(e){}}function p(e,t,n){e.clearRect(0,0,e.canvas.width,e.canvas.height),e.fillText(t,0,0);var t=new Uint32Array(e.getImageData(0,0,e.canvas.width,e.canvas.height).data),r=(e.clearRect(0,0,e.canvas.width,e.canvas.height),e.fillText(n,0,0),new Uint32Array(e.getImageData(0,0,e.canvas.width,e.canvas.height).data));return t.every(function(e,t){return e===r[t]})}function u(e,t,n){switch(t){case"flag":return n(e,"\ud83c\udff3\ufe0f\u200d\u26a7\ufe0f","\ud83c\udff3\ufe0f\u200b\u26a7\ufe0f")?!1:!n(e,"\ud83c\uddfa\ud83c\uddf3","\ud83c\uddfa\u200b\ud83c\uddf3")&&!n(e,"\ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc65\udb40\udc6e\udb40\udc67\udb40\udc7f","\ud83c\udff4\u200b\udb40\udc67\u200b\udb40\udc62\u200b\udb40\udc65\u200b\udb40\udc6e\u200b\udb40\udc67\u200b\udb40\udc7f");case"emoji":return!n(e,"\ud83d\udc26\u200d\u2b1b","\ud83d\udc26\u200b\u2b1b")}return!1}function f(e,t,n){var 
r="undefined"!=typeof WorkerGlobalScope&&self instanceof WorkerGlobalScope?new OffscreenCanvas(300,150):i.createElement("canvas"),a=r.getContext("2d",{willReadFrequently:!0}),o=(a.textBaseline="top",a.font="600 32px Arial",{});return e.forEach(function(e){o[e]=t(a,e,n)}),o}function t(e){var t=i.createElement("script");t.src=e,t.defer=!0,i.head.appendChild(t)}"undefined"!=typeof Promise&&(o="wpEmojiSettingsSupports",s=["flag","emoji"],n.supports={everything:!0,everythingExceptFlag:!0},e=new Promise(function(e){i.addEventListener("DOMContentLoaded",e,{once:!0})}),new Promise(function(t){var n=function(){try{var e=JSON.parse(sessionStorage.getItem(o));if("object"==typeof e&&"number"==typeof e.timestamp&&(new Date).valueOf()<e.timestamp+604800&&"object"==typeof e.supportTests)return e.supportTests}catch(e){}return null}();if(!n){if("undefined"!=typeof Worker&&"undefined"!=typeof OffscreenCanvas&&"undefined"!=typeof URL&&URL.createObjectURL&&"undefined"!=typeof Blob)try{var e="postMessage("+f.toString()+"("+[JSON.stringify(s),u.toString(),p.toString()].join(",")+"));",r=new Blob([e],{type:"text/javascript"}),a=new Worker(URL.createObjectURL(r),{name:"wpTestEmojiSupports"});return void(a.onmessage=function(e){c(n=e.data),a.terminate(),t(n)})}catch(e){}c(n=f(s,u,p))}t(n)}).then(function(e){for(var t in e)n.supports[t]=e[t],n.supports.everything=n.supports.everything&&n.supports[t],"flag"!==t&&(n.supports.everythingExceptFlag=n.supports.everythingExceptFlag&&n.supports[t]);n.supports.everythingExceptFlag=n.supports.everythingExceptFlag&&!n.supports.flag,n.DOMReady=!1,n.readyCallback=function(){n.DOMReady=!0}}).then(function(){return e}).then(function(){var e;n.supports.everything||(n.readyCallback(),(e=n.source||{}).concatemoji?t(e.concatemoji):e.wpemoji&&e.twemoji&&(t(e.twemoji),t(e.wpemoji)))}))}((window,document),window._wpemojiSettings); /* ]]> */ </script> <style id='wp-emoji-styles-inline-css'> img.wp-smiley, img.emoji { display: inline !important; border: none 
!important; box-shadow: none !important; height: 1em !important; width: 1em !important; margin: 0 0.07em !important; vertical-align: -0.1em !important; background: none !important; padding: 0 !important; } </style> <link rel='stylesheet' id='all-css-2' href='https://news.sophos.com/wp-includes/css/dist/block-library/style.min.css?m=1739294329g' type='text/css' media='all' /> <style id='safe-svg-svg-icon-style-inline-css'> .safe-svg-cover{text-align:center}.safe-svg-cover .safe-svg-inside{display:inline-block;max-width:100%}.safe-svg-cover svg{height:100%;max-height:100%;max-width:100%;width:100%} </style> <link rel='stylesheet' id='all-css-6' href='https://news.sophos.com/_static/??-eJzTLy/QzcxLzilNSS3WzyrWz01NyUxMzUnNTc0rQeEU5CRWphbp5qSmJyZX6uVm5uklFxfr6OPTDpRD5sM02efaGpobWxpZmhgbGwAAROEu5A==' type='text/css' media='all' /> <style id='jetpack-sharing-buttons-style-inline-css'> .jetpack-sharing-buttons__services-list{display:flex;flex-direction:row;flex-wrap:wrap;gap:0;list-style-type:none;margin:5px;padding:0}.jetpack-sharing-buttons__services-list.has-small-icon-size{font-size:12px}.jetpack-sharing-buttons__services-list.has-normal-icon-size{font-size:16px}.jetpack-sharing-buttons__services-list.has-large-icon-size{font-size:24px}.jetpack-sharing-buttons__services-list.has-huge-icon-size{font-size:36px}@media print{.jetpack-sharing-buttons__services-list{display:none!important}}.editor-styles-wrapper .wp-block-jetpack-sharing-buttons{gap:0;padding-inline-start:0}ul.jetpack-sharing-buttons__services-list.has-background{padding:1.25em 2.375em} </style> <style id='co-authors-plus-coauthors-style-inline-css'> .wp-block-co-authors-plus-coauthors.is-layout-flow [class*=wp-block-co-authors-plus]{display:inline} </style> <style id='co-authors-plus-avatar-style-inline-css'> .wp-block-co-authors-plus-avatar :where(img){height:auto;max-width:100%;vertical-align:bottom}.wp-block-co-authors-plus-coauthors.is-layout-flow .wp-block-co-authors-plus-avatar 
:where(img){vertical-align:middle}.wp-block-co-authors-plus-avatar:is(.alignleft,.alignright){display:table}.wp-block-co-authors-plus-avatar.aligncenter{display:table;margin-inline:auto} </style> <style id='co-authors-plus-image-style-inline-css'> .wp-block-co-authors-plus-image{margin-bottom:0}.wp-block-co-authors-plus-image :where(img){height:auto;max-width:100%;vertical-align:bottom}.wp-block-co-authors-plus-coauthors.is-layout-flow .wp-block-co-authors-plus-image :where(img){vertical-align:middle}.wp-block-co-authors-plus-image:is(.alignfull,.alignwide) :where(img){width:100%}.wp-block-co-authors-plus-image:is(.alignleft,.alignright){display:table}.wp-block-co-authors-plus-image.aligncenter{display:table;margin-inline:auto} </style> <style id='elasticpress-facet-style-inline-css'> .widget_ep-facet input[type=search],.wp-block-elasticpress-facet input[type=search]{margin-bottom:1rem}.widget_ep-facet .searchable .inner,.wp-block-elasticpress-facet .searchable .inner{max-height:20em;overflow:scroll}.widget_ep-facet .term.hide,.wp-block-elasticpress-facet .term.hide{display:none}.widget_ep-facet .empty-term,.wp-block-elasticpress-facet .empty-term{opacity:.5;position:relative}.widget_ep-facet .empty-term:after,.wp-block-elasticpress-facet .empty-term:after{bottom:0;content:" ";display:block;left:0;position:absolute;right:0;top:0;width:100%;z-index:2}.widget_ep-facet .level-1,.wp-block-elasticpress-facet .level-1{padding-left:20px}.widget_ep-facet .level-2,.wp-block-elasticpress-facet .level-2{padding-left:40px}.widget_ep-facet .level-3,.wp-block-elasticpress-facet .level-3{padding-left:60px}.widget_ep-facet .level-4,.wp-block-elasticpress-facet .level-4{padding-left:5pc}.widget_ep-facet .level-5,.wp-block-elasticpress-facet .level-5{padding-left:75pt}.widget_ep-facet input[disabled],.wp-block-elasticpress-facet input[disabled]{cursor:pointer;opacity:1}.widget_ep-facet .term a,.wp-block-elasticpress-facet .term 
a{-webkit-box-align:center;-ms-flex-align:center;align-items:center;display:-webkit-box;display:-ms-flexbox;display:flex;position:relative}.widget_ep-facet .term a:hover .ep-checkbox,.wp-block-elasticpress-facet .term a:hover .ep-checkbox{background-color:#ccc}.ep-checkbox{-webkit-box-align:center;-ms-flex-align:center;-ms-flex-negative:0;-webkit-box-pack:center;-ms-flex-pack:center;align-items:center;background-color:#eee;display:-webkit-box;display:-ms-flexbox;display:flex;flex-shrink:0;height:1em;justify-content:center;margin-right:.25em;width:1em}.ep-checkbox:after{border:solid #fff;border-width:0 .125em .125em 0;content:"";display:none;height:.5em;-webkit-transform:rotate(45deg);transform:rotate(45deg);width:.25em}.ep-checkbox.checked{background-color:#5e5e5e}.ep-checkbox.checked:after{display:block} </style> <link rel='stylesheet' id='all-css-18' href='https://news.sophos.com/wp-content/mu-plugins/search/elasticpress/dist/css/related-posts-block-styles.min.css?m=1736472017g' type='text/css' media='all' /> <style id='classic-theme-styles-inline-css'> /*! 
This file is auto-generated */ .wp-block-button__link{color:#fff;background-color:#32373c;border-radius:9999px;box-shadow:none;text-decoration:none;padding:calc(.667em + 2px) calc(1.333em + 2px);font-size:1.125em}.wp-block-file__button{background:#32373c;color:#fff;text-decoration:none} </style> <style id='global-styles-inline-css'> :root{--wp--preset--aspect-ratio--square: 1;--wp--preset--aspect-ratio--4-3: 4/3;--wp--preset--aspect-ratio--3-4: 3/4;--wp--preset--aspect-ratio--3-2: 3/2;--wp--preset--aspect-ratio--2-3: 2/3;--wp--preset--aspect-ratio--16-9: 16/9;--wp--preset--aspect-ratio--9-16: 9/16;--wp--preset--color--black: #000000;--wp--preset--color--cyan-bluish-gray: #abb8c3;--wp--preset--color--white: #ffffff;--wp--preset--color--pale-pink: #f78da7;--wp--preset--color--vivid-red: #cf2e2e;--wp--preset--color--luminous-vivid-orange: #ff6900;--wp--preset--color--luminous-vivid-amber: #fcb900;--wp--preset--color--light-green-cyan: #7bdcb5;--wp--preset--color--vivid-green-cyan: #00d084;--wp--preset--color--pale-cyan-blue: #8ed1fc;--wp--preset--color--vivid-cyan-blue: #0693e3;--wp--preset--color--vivid-purple: #9b51e0;--wp--preset--gradient--vivid-cyan-blue-to-vivid-purple: linear-gradient(135deg,rgba(6,147,227,1) 0%,rgb(155,81,224) 100%);--wp--preset--gradient--light-green-cyan-to-vivid-green-cyan: linear-gradient(135deg,rgb(122,220,180) 0%,rgb(0,208,130) 100%);--wp--preset--gradient--luminous-vivid-amber-to-luminous-vivid-orange: linear-gradient(135deg,rgba(252,185,0,1) 0%,rgba(255,105,0,1) 100%);--wp--preset--gradient--luminous-vivid-orange-to-vivid-red: linear-gradient(135deg,rgba(255,105,0,1) 0%,rgb(207,46,46) 100%);--wp--preset--gradient--very-light-gray-to-cyan-bluish-gray: linear-gradient(135deg,rgb(238,238,238) 0%,rgb(169,184,195) 100%);--wp--preset--gradient--cool-to-warm-spectrum: linear-gradient(135deg,rgb(74,234,220) 0%,rgb(151,120,209) 20%,rgb(207,42,186) 40%,rgb(238,44,130) 60%,rgb(251,105,98) 80%,rgb(254,248,76) 
100%);--wp--preset--gradient--blush-light-purple: linear-gradient(135deg,rgb(255,206,236) 0%,rgb(152,150,240) 100%);--wp--preset--gradient--blush-bordeaux: linear-gradient(135deg,rgb(254,205,165) 0%,rgb(254,45,45) 50%,rgb(107,0,62) 100%);--wp--preset--gradient--luminous-dusk: linear-gradient(135deg,rgb(255,203,112) 0%,rgb(199,81,192) 50%,rgb(65,88,208) 100%);--wp--preset--gradient--pale-ocean: linear-gradient(135deg,rgb(255,245,203) 0%,rgb(182,227,212) 50%,rgb(51,167,181) 100%);--wp--preset--gradient--electric-grass: linear-gradient(135deg,rgb(202,248,128) 0%,rgb(113,206,126) 100%);--wp--preset--gradient--midnight: linear-gradient(135deg,rgb(2,3,129) 0%,rgb(40,116,252) 100%);--wp--preset--font-size--small: 13px;--wp--preset--font-size--medium: 20px;--wp--preset--font-size--large: 36px;--wp--preset--font-size--x-large: 42px;--wp--preset--spacing--20: 0.44rem;--wp--preset--spacing--30: 0.67rem;--wp--preset--spacing--40: 1rem;--wp--preset--spacing--50: 1.5rem;--wp--preset--spacing--60: 2.25rem;--wp--preset--spacing--70: 3.38rem;--wp--preset--spacing--80: 5.06rem;--wp--preset--shadow--natural: 6px 6px 9px rgba(0, 0, 0, 0.2);--wp--preset--shadow--deep: 12px 12px 50px rgba(0, 0, 0, 0.4);--wp--preset--shadow--sharp: 6px 6px 0px rgba(0, 0, 0, 0.2);--wp--preset--shadow--outlined: 6px 6px 0px -3px rgba(255, 255, 255, 1), 6px 6px rgba(0, 0, 0, 1);--wp--preset--shadow--crisp: 6px 6px 0px rgba(0, 0, 0, 1);}:where(.is-layout-flex){gap: 0.5em;}:where(.is-layout-grid){gap: 0.5em;}body .is-layout-flex{display: flex;}.is-layout-flex{flex-wrap: wrap;align-items: center;}.is-layout-flex > :is(*, div){margin: 0;}body .is-layout-grid{display: grid;}.is-layout-grid > :is(*, div){margin: 0;}:where(.wp-block-columns.is-layout-flex){gap: 2em;}:where(.wp-block-columns.is-layout-grid){gap: 2em;}:where(.wp-block-post-template.is-layout-flex){gap: 1.25em;}:where(.wp-block-post-template.is-layout-grid){gap: 1.25em;}.has-black-color{color: var(--wp--preset--color--black) 
!important;}.has-cyan-bluish-gray-color{color: var(--wp--preset--color--cyan-bluish-gray) !important;}.has-white-color{color: var(--wp--preset--color--white) !important;}.has-pale-pink-color{color: var(--wp--preset--color--pale-pink) !important;}.has-vivid-red-color{color: var(--wp--preset--color--vivid-red) !important;}.has-luminous-vivid-orange-color{color: var(--wp--preset--color--luminous-vivid-orange) !important;}.has-luminous-vivid-amber-color{color: var(--wp--preset--color--luminous-vivid-amber) !important;}.has-light-green-cyan-color{color: var(--wp--preset--color--light-green-cyan) !important;}.has-vivid-green-cyan-color{color: var(--wp--preset--color--vivid-green-cyan) !important;}.has-pale-cyan-blue-color{color: var(--wp--preset--color--pale-cyan-blue) !important;}.has-vivid-cyan-blue-color{color: var(--wp--preset--color--vivid-cyan-blue) !important;}.has-vivid-purple-color{color: var(--wp--preset--color--vivid-purple) !important;}.has-black-background-color{background-color: var(--wp--preset--color--black) !important;}.has-cyan-bluish-gray-background-color{background-color: var(--wp--preset--color--cyan-bluish-gray) !important;}.has-white-background-color{background-color: var(--wp--preset--color--white) !important;}.has-pale-pink-background-color{background-color: var(--wp--preset--color--pale-pink) !important;}.has-vivid-red-background-color{background-color: var(--wp--preset--color--vivid-red) !important;}.has-luminous-vivid-orange-background-color{background-color: var(--wp--preset--color--luminous-vivid-orange) !important;}.has-luminous-vivid-amber-background-color{background-color: var(--wp--preset--color--luminous-vivid-amber) !important;}.has-light-green-cyan-background-color{background-color: var(--wp--preset--color--light-green-cyan) !important;}.has-vivid-green-cyan-background-color{background-color: var(--wp--preset--color--vivid-green-cyan) !important;}.has-pale-cyan-blue-background-color{background-color: 
var(--wp--preset--color--pale-cyan-blue) !important;}.has-vivid-cyan-blue-background-color{background-color: var(--wp--preset--color--vivid-cyan-blue) !important;}.has-vivid-purple-background-color{background-color: var(--wp--preset--color--vivid-purple) !important;}.has-black-border-color{border-color: var(--wp--preset--color--black) !important;}.has-cyan-bluish-gray-border-color{border-color: var(--wp--preset--color--cyan-bluish-gray) !important;}.has-white-border-color{border-color: var(--wp--preset--color--white) !important;}.has-pale-pink-border-color{border-color: var(--wp--preset--color--pale-pink) !important;}.has-vivid-red-border-color{border-color: var(--wp--preset--color--vivid-red) !important;}.has-luminous-vivid-orange-border-color{border-color: var(--wp--preset--color--luminous-vivid-orange) !important;}.has-luminous-vivid-amber-border-color{border-color: var(--wp--preset--color--luminous-vivid-amber) !important;}.has-light-green-cyan-border-color{border-color: var(--wp--preset--color--light-green-cyan) !important;}.has-vivid-green-cyan-border-color{border-color: var(--wp--preset--color--vivid-green-cyan) !important;}.has-pale-cyan-blue-border-color{border-color: var(--wp--preset--color--pale-cyan-blue) !important;}.has-vivid-cyan-blue-border-color{border-color: var(--wp--preset--color--vivid-cyan-blue) !important;}.has-vivid-purple-border-color{border-color: var(--wp--preset--color--vivid-purple) !important;}.has-vivid-cyan-blue-to-vivid-purple-gradient-background{background: var(--wp--preset--gradient--vivid-cyan-blue-to-vivid-purple) !important;}.has-light-green-cyan-to-vivid-green-cyan-gradient-background{background: var(--wp--preset--gradient--light-green-cyan-to-vivid-green-cyan) !important;}.has-luminous-vivid-amber-to-luminous-vivid-orange-gradient-background{background: var(--wp--preset--gradient--luminous-vivid-amber-to-luminous-vivid-orange) !important;}.has-luminous-vivid-orange-to-vivid-red-gradient-background{background: 
var(--wp--preset--gradient--luminous-vivid-orange-to-vivid-red) !important;}.has-very-light-gray-to-cyan-bluish-gray-gradient-background{background: var(--wp--preset--gradient--very-light-gray-to-cyan-bluish-gray) !important;}.has-cool-to-warm-spectrum-gradient-background{background: var(--wp--preset--gradient--cool-to-warm-spectrum) !important;}.has-blush-light-purple-gradient-background{background: var(--wp--preset--gradient--blush-light-purple) !important;}.has-blush-bordeaux-gradient-background{background: var(--wp--preset--gradient--blush-bordeaux) !important;}.has-luminous-dusk-gradient-background{background: var(--wp--preset--gradient--luminous-dusk) !important;}.has-pale-ocean-gradient-background{background: var(--wp--preset--gradient--pale-ocean) !important;}.has-electric-grass-gradient-background{background: var(--wp--preset--gradient--electric-grass) !important;}.has-midnight-gradient-background{background: var(--wp--preset--gradient--midnight) !important;}.has-small-font-size{font-size: var(--wp--preset--font-size--small) !important;}.has-medium-font-size{font-size: var(--wp--preset--font-size--medium) !important;}.has-large-font-size{font-size: var(--wp--preset--font-size--large) !important;}.has-x-large-font-size{font-size: var(--wp--preset--font-size--x-large) !important;} :where(.wp-block-post-template.is-layout-flex){gap: 1.25em;}:where(.wp-block-post-template.is-layout-grid){gap: 1.25em;} :where(.wp-block-columns.is-layout-flex){gap: 2em;}:where(.wp-block-columns.is-layout-grid){gap: 2em;} :root :where(.wp-block-pullquote){font-size: 1.5em;line-height: 1.6;} </style> <link rel='stylesheet' id='all-css-22' href='https://news.sophos.com/_static/??-eJyNjkEOwiAQRS8kTGlVdGE8iiEwARQG0oE0vb2tK925e2/xXz4sVdhCDalBC5iRgUsNhQkXFuOgNHBbE244KmmZD/C1yF3U1H0khie2auxLqKMcIRfX054KZkZnnFs/GMn/nXhEstsVG00SqfjCPyJzpD11zzelp+t0Hi769AZmDEg5' type='text/css' media='all' /> <script type="text/javascript" 
src="https://news.sophos.com/_static/??-eJzTLy/QzcxLzilNSS3WzwKiwtLUokoopZebmaeXVayjj0+Rbm5melFiSSpUsX2uraG5saWRpYmxsUEWAK+aIiE=" ></script><link rel="https://api.w.org/" href="https://news.sophos.com/wp-json/" /><link rel="alternate" title="JSON" type="application/json" href="https://news.sophos.com/wp-json/wp/v2/posts/92239" /><link rel="EditURI" type="application/rsd+xml" title="RSD" href="https://news.sophos.com/xmlrpc.php?rsd" /> <meta name="generator" content="WordPress 6.7.2" /> <link rel="canonical" href="https://news.sophos.com/en-us/2023/06/22/using-large-language-models-classify-uncharted-web/" /> <link rel='shortlink' href='https://news.sophos.com/?p=92239' /> <link rel="alternate" title="oEmbed (JSON)" type="application/json+oembed" href="https://news.sophos.com/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fnews.sophos.com%2Fen-us%2F2023%2F06%2F22%2Fusing-large-language-models-classify-uncharted-web%2F" /> <link rel="alternate" title="oEmbed (XML)" type="text/xml+oembed" href="https://news.sophos.com/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fnews.sophos.com%2Fen-us%2F2023%2F06%2F22%2Fusing-large-language-models-classify-uncharted-web%2F&format=xml" /> <link rel="me" href="https://infosec.exchange/@SophosXOps"/> <link rel="alternate" type="text/html" media="only screen and (max-width: 640px)" href="https://news.sophos.com/en-us/2023/06/22/using-large-language-models-classify-uncharted-web/?amp=1"> <style>img#wpstats{display:none}</style> <link rel="amphtml" href="https://news.sophos.com/en-us/2023/06/22/using-large-language-models-classify-uncharted-web/?amp=1"><style>#amp-mobile-version-switcher{left:0;position:absolute;width:100%;z-index:100}#amp-mobile-version-switcher>a{background-color:#444;border:0;color:#eaeaea;display:block;font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Oxygen-Sans,Ubuntu,Cantarell,Helvetica Neue,sans-serif;font-size:16px;font-weight:600;padding:15px 
0;text-align:center;-webkit-text-decoration:none;text-decoration:none}#amp-mobile-version-switcher>a:active,#amp-mobile-version-switcher>a:focus,#amp-mobile-version-switcher>a:hover{-webkit-text-decoration:underline;text-decoration:underline}</style> <!-- Jetpack Open Graph Tags --> <meta property="og:type" content="article" /> <meta property="og:title" content="And I Shall Call It Mini-Me GPT: Using Large Language Models to Classify the Uncharted Web" /> <meta property="og:url" content="https://news.sophos.com/en-us/2023/06/22/using-large-language-models-classify-uncharted-web/" /> <meta property="og:description" content="Sophos AI team employs GPT and other large language models as teachers to train smaller models to label websites." /> <meta property="article:published_time" content="2023-06-22T11:30:10+00:00" /> <meta property="article:modified_time" content="2023-06-22T12:24:51+00:00" /> <meta property="og:site_name" content="Sophos News" /> <meta property="og:image" content="https://news.sophos.com/wp-content/uploads/2023/06/robuts.png?w=640" /> <meta property="og:image:secure_url" content="https://news.sophos.com/wp-content/uploads/2023/06/robuts.png?w=640" /> <meta property="og:image:width" content="640" /> <meta property="og:image:height" content="359" /> <meta property="og:image:alt" content="" /> <meta property="og:locale" content="en_US" /> <meta property="fb:admins" content="28552295016" /> <meta name="twitter:text:title" content="And I Shall Call It Mini-Me GPT: Using Large Language Models to Classify the Uncharted Web" /> <meta name="twitter:image" content="https://news.sophos.com/wp-content/uploads/2023/06/robuts.png?w=640" /> <meta name="twitter:card" content="summary_large_image" /> <!-- End Jetpack Open Graph Tags --> <link rel="icon" href="https://news.sophos.com/wp-content/uploads/2020/01/cropped-sophos.png?w=32" sizes="32x32" /> <link rel="icon" href="https://news.sophos.com/wp-content/uploads/2020/01/cropped-sophos.png?w=192" sizes="192x192" 
/> <link rel="apple-touch-icon" href="https://news.sophos.com/wp-content/uploads/2020/01/cropped-sophos.png?w=180" /> <meta name="msapplication-TileImage" content="https://news.sophos.com/wp-content/uploads/2020/01/cropped-sophos.png?w=270" /> <style type="text/css" id="wp-custom-css"> .entry-content .embed-vimeo iframe, .entry-content .embed-youtube iframe { aspect-ratio: 16/9; width: 100%; height: auto; } </style> </head> <body class="post-template-default single single-post postid-92239 single-format-standard group-blog"> <!-- Google Tag Manager (noscript) --> <noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-TW8W88B" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript> <!-- End Google Tag Manager (noscript) --> <div id="page" class="hfeed site"> <a class="sr-only" href="#content">Skip to content</a> <header class="bg-blue-600" x-data="{ mobileMenu: false, searchField: false }"> <div class="container"> <div class="flex items-center justify-between h-16"> <!-- Logo --> <div class="flex-shrink-0"> <a class="site-logo" href="https://news.sophos.com/en-us/" rel="home"> <svg width="172" height="17" xmlns="http://www.w3.org/2000/svg"> <g fill="#FFF" fill-rule="evenodd"> <path d="M113.024 5.298V16.74h-2.595V.259h2.265l7.997 11.49V.26h2.619v16.482h-2.289l-7.997-11.443M126.064.259h10.78v2.307H128.8v4.521h7.549v2.214h-7.55v5.133h8.376v2.307h-11.111V.259M138.478.259h2.855l2.694 12.29L147.29.26h2.783l3.61 12.314L156.005.26h2.783l-3.62 16.482h-2.76l-3.751-12.126-3.426 12.126h-2.784L138.478.259M168.933 4.968v-.283c0-1.318-.778-2.425-3.492-2.425-2.43 0-3.279 1.013-3.279 2.284 0 1.201.708 1.743 2.218 2.073l3.491.776c2.123.448 4.129 1.602 4.129 4.333 0 3.014-1.675 5.274-6.204 5.274-5.214 0-6.559-2.26-6.559-4.52v-.307h2.737v.26c0 1.2.755 2.284 3.774 2.284 2.5 0 3.421-1.084 3.421-2.638 0-1.224-.731-1.907-2.289-2.237l-3.49-.777c-2.407-.517-3.917-1.742-3.917-4.309 0-2.566 1.77-4.756 6.016-4.756 4.553 0 6.18 2.26 6.18 
4.639v.33h-2.736M85.303 16.718h8.88c2.492 0 3.549-.15 4.379-.677 1.308-.803 2.139-2.378 2.139-4.162 0-1.457-.504-2.868-1.258-3.622-.981-1.006-2.316-1.382-4.783-1.382h-2.693c-1.208 0-2.097-.05-2.6-.276-.605-.277-.956-.81-.956-1.562 0-.88.427-1.455 1.132-1.632.529-.124 1.14-.124 2.726-.15h7.949V.265h-8.754c-1.963 0-2.843.075-3.598.353-1.737.602-2.921 2.383-2.921 4.518 0 1.458.58 2.745 1.587 3.624.881.753 2.189 1.105 4.202 1.105h3.584c.805 0 1.46.1 1.813.3.678.327 1.08.934 1.08 1.714 0 .652-.301 1.122-.83 1.447-.426.278-1.158.403-2.49.403h-8.588v2.99zm-84.945 0h8.88c2.492 0 3.549-.15 4.38-.677 1.307-.803 2.138-2.378 2.138-4.162 0-1.457-.504-2.868-1.258-3.622-.982-1.006-2.316-1.382-4.783-1.382H7.023c-1.209 0-2.098-.05-2.6-.276-.605-.277-.957-.81-.957-1.562 0-.88.427-1.455 1.132-1.632.53-.124 1.141-.124 2.726-.15h7.95V.265H6.52c-1.964 0-2.844.075-3.6.353C1.185 1.22 0 3 0 5.136 0 6.594.582 7.881 1.587 8.76c.881.753 2.19 1.105 4.203 1.105h3.582c.807 0 1.46.1 1.814.3.678.327 1.08.934 1.08 1.714 0 .652-.3 1.122-.83 1.447-.426.278-1.157.403-2.49.403H.358v2.99zM71.99 4.596c-.52.813-.765 2.118-.765 3.87 0 3.845 1.331 5.595 4.294 5.595 2.915 0 4.248-1.75 4.248-5.546 0-3.847-1.308-5.571-4.248-5.571-1.604 0-2.864.592-3.53 1.652zm10.05-1.897c1.013 1.33 1.58 3.498 1.58 6.039 0 2.882-.914 5.249-2.544 6.555-1.233.986-3.11 1.528-5.335 1.528-3.16 0-5.654-1.037-6.937-2.884-.964-1.355-1.435-3.155-1.435-5.35 0-3.152.866-5.544 2.495-6.826C71.149.726 73.175.158 75.497.158c2.938 0 5.284.913 6.543 2.54zM65.36.279h-3.507v6.73h-6.345V.278h-3.507v16.439h3.507V9.94h6.345v6.778h3.506V.278zM43.533 8.042c.938 0 1.48-.123 1.852-.469.442-.37.715-1.158.715-2.07 0-1.084-.443-1.872-1.208-2.144-.272-.1-.717-.149-1.286-.149h-4.839v4.832h4.766zm-4.766 8.674h-3.507V.278h8.223c2.889 0 3.902.295 4.988 1.504.964 1.036 1.481 2.39 1.481 3.845 0 1.725-.69 3.327-1.826 4.289-.962.813-1.854 1.058-3.728 1.058h-5.63v5.743zM21.665 4.596c-.519.813-.764 2.118-.764 3.87 0 3.845 1.333 5.595 4.297 5.595 2.913 0 4.247-1.75 
4.247-5.546 0-3.847-1.308-5.571-4.247-5.571-1.606 0-2.866.592-3.533 1.652zm10.052-1.897c1.014 1.33 1.581 3.498 1.581 6.039 0 2.882-.914 5.249-2.545 6.555-1.233.986-3.11 1.528-5.333 1.528-3.162 0-5.656-1.037-6.94-2.884-.964-1.355-1.432-3.155-1.432-5.35 0-3.152.865-5.544 2.496-6.826C20.825.726 22.85.158 25.173.158c2.938 0 5.286.913 6.544 2.54z"/> </g> </svg> </a> </div> <!-- Search Field: rendered only while the Alpine `searchField` flag is true (x-show). --> <div class="lg:flex justify-end flex-grow hidden" x-show="searchField" x-cloak> <div class="relative w-1/2 rounded-md shadow-sm"> <form role="search" method="get" action="https://news.sophos.com/en-us/"> <!-- A placeholder is not an accessible name, so the field carries an explicit aria-label. --> <input type="text" class="block w-full text-lg text-white placeholder-gray-100 bg-blue-800 border-0 rounded-md font-sansMedium font-medium" placeholder="Type to Search News" x-ref="searchInput" name="s" aria-label="Search News" /> <div class="absolute inset-y-0 right-0 flex items-center px-3"> <button class="hover:opacity-100 opacity-60 p-1 text-xs text-white uppercase rounded-full cursor-pointer" type="submit" > Search </button> </div> </form> </div> </div> <!-- Main Nav: rendered only while `searchField` is false, i.e. the inverse of the search field above. --> <div class="lg:flex items-center flex-grow hidden" x-show="!searchField" x-cloak> <div class="flex ml-auto"> <ul id="menu-en-us-primary" class="primary-menu"><li id="menu-item-77773" class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-77773"><a href="https://news.sophos.com/en-us/category/products-services/">Products &amp; Services<div class="menu-item-description"></div></a></li> <li id="menu-item-77772" class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-77772"><a href="https://news.sophos.com/en-us/category/security-operations/">Security Operations<div class="menu-item-description"></div></a></li> <li id="menu-item-77774" class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-77774"><a href="https://news.sophos.com/en-us/category/threat-research/">Threat Research<div class="menu-item-description"></div></a></li> <li id="menu-item-85326" class="menu-item 
menu-item-type-taxonomy menu-item-object-category current-post-ancestor current-menu-parent current-post-parent menu-item-85326"><a href="https://news.sophos.com/en-us/category/ai-research/">AI Research<div class="menu-item-description"></div></a></li> <li id="menu-item-951374" class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-951374"><a href="https://news.sophos.com/en-us/category/serious-security/">Naked Security<div class="menu-item-description"></div></a></li> <li id="menu-item-83702" class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-83702"><a href="https://news.sophos.com/en-us/category/sophos-life/">Sophos Life<div class="menu-item-description"></div></a></li> </ul> </div> </div> <!-- Search button --> <div class="lg:block hidden ml-4"> <div class="flex items-center"> <button class="border-2 border-transparent hover:border-white inline-flex items-center justify-center p-2 text-white rounded-md focus:outline-none transition-colors" @click.prevent="searchField = !searchField; $nextTick(() => { setTimeout(() => { $refs.searchInput.focus(); }, 150);});" > <span class="sr-only">Search</span> <!-- Heroicon name: outline/bell --> <svg class="w-5 h-5" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke="currentColor" :class="{ 'block': !searchField, 'hidden': searchField }" > <path stroke-linecap="round" stroke-linejoin="round" stroke-width="3" d="M21 21l-6-6m2-5a7 7 0 11-14 0 7 7 0 0114 0z" /> </svg> <svg class="hidden w-5 h-5" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke="currentColor" aria-hidden="true" :class="{ 'block': searchField, 'hidden': !searchField }" > <path stroke-linecap="round" stroke-linejoin="round" stroke-width="3" d="M6 18L18 6M6 6l12 12" /> </svg> </button> </div> </div> <!-- Mobile menu button --> <div class="lg:hidden flex -mr-2"> <button type="button" class="hover:text-white hover:bg-blue-800 focus:outline-none hover:ring-2 
focus:ring-offset-2 focus:ring-offset-gray-300 focus:ring-white inline-flex items-center justify-center p-2 text-white rounded-md" aria-controls="mobile-menu" aria-expanded="false" @click="mobileMenu = !mobileMenu" > <span class="sr-only">Open main menu</span> <!-- Heroicon name: outline/menu Menu open: "hidden", Menu closed: "block" --> <svg class="block w-6 h-6" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke="currentColor" :class="{ 'block': !mobileMenu, 'hidden': mobileMenu }" > <path stroke-linecap="round" stroke-linejoin="round" stroke-width="3" d="M4 6h16M4 12h16m-7 6h7" /> </svg> <!-- Heroicon name: outline/x Menu open: "block", Menu closed: "hidden" --> <svg class="hidden w-6 h-6" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke="currentColor" aria-hidden="true" :class="{ 'block': mobileMenu, 'hidden': !mobileMenu }" > <path stroke-linecap="round" stroke-linejoin="round" stroke-width="3" d="M6 18L18 6M6 6l12 12" /> </svg> </button> </div> </div> </div> <!-- Mobile menu, show/hide based on menu state. 
--> <div class="lg:hidden container" x-show="mobileMenu" x-cloak x-transition:enter="transition-all ease-out duration-100" x-transition:enter-start="transform opacity-0 scale-95" x-transition:enter-end="transform opacity-100 scale-100" x-transition:leave="transition ease-in duration-75" x-transition:leave-start="transform opacity-100 scale-100" x-transition:leave-end="transform opacity-0 scale-95" > <div class="pt-2 pb-8 space-y-2"> <div class="relative rounded-md shadow-sm"> <form role="search" method="get" action="https://news.sophos.com/en-us/"> <input type="text" class="focus:ring-blue-600 focus:border-blue-600 sm:text-sm block w-full placeholder-gray-600 border-gray-300 rounded-md" placeholder="Search News" name="s" /> <div class="absolute inset-y-0 right-0 flex items-center px-3 pointer-events-none" > <button class="p-1 text-gray-500 rounded-full" type="submit"> <span class="sr-only">Search</span> <!-- Heroicon name: outline/bell --> <svg class="w-4 h-4" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke="currentColor" > <path stroke-linecap="round" stroke-linejoin="round" stroke-width="3" d="M21 21l-6-6m2-5a7 7 0 11-14 0 7 7 0 0114 0z" /> </svg> </button> </div> </form> </div> <ul id="menu-en-us-primary-1" class="mobile-menu"><li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-77773"><a href="https://news.sophos.com/en-us/category/products-services/">Products & Services<div class="menu-item-description"></div></a></li> <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-77772"><a href="https://news.sophos.com/en-us/category/security-operations/">Security Operations<div class="menu-item-description"></div></a></li> <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-77774"><a href="https://news.sophos.com/en-us/category/threat-research/">Threat Research<div class="menu-item-description"></div></a></li> <li class="menu-item menu-item-type-taxonomy 
menu-item-object-category current-post-ancestor current-menu-parent current-post-parent menu-item-85326"><a href="https://news.sophos.com/en-us/category/ai-research/">AI Research<div class="menu-item-description"></div></a></li> <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-951374"><a href="https://news.sophos.com/en-us/category/serious-security/">Naked Security<div class="menu-item-description"></div></a></li> <li class="menu-item menu-item-type-taxonomy menu-item-object-category menu-item-83702"><a href="https://news.sophos.com/en-us/category/sophos-life/">Sophos Life<div class="menu-item-description"></div></a></li> </ul> </div> </div> </header> <div id="content"> <div id="primary" class="content-area"> <main id="main" class="site-main" role="main"> <article id="post-92239" class="post-92239 post type-post status-publish format-standard has-post-thumbnail hentry category-ai-research tag-bert tag-featured tag-gpt-3 tag-large-language-models tag-sophos-x-ops tag-t5-large-llm tag-web-filtering tag-website-categorization region-en-us"> <div class="md:mt-16 container mt-8"> <div class="relative max-w-5xl mx-auto"> <div class="aspect-w-16 aspect-h-9 flex bg-gray-400 bg-right bg-no-repeat bg-cover" > <img width="1200" height="673" src="https://news.sophos.com/wp-content/uploads/2023/06/robuts.png?w=1200" class="object-cover wp-post-image" alt="" decoding="async" fetchpriority="high" srcset="https://news.sophos.com/wp-content/uploads/2023/06/robuts.png 1456w, https://news.sophos.com/wp-content/uploads/2023/06/robuts.png?resize=300,168 300w, https://news.sophos.com/wp-content/uploads/2023/06/robuts.png?resize=768,430 768w, https://news.sophos.com/wp-content/uploads/2023/06/robuts.png?resize=1024,574 1024w" sizes="(max-width: 1200px) 100vw, 1200px" /> </div> <div class="left-4 w-24 h-24 lg:left-12 xl:left-16 lg:w-40 lg:h-40 place-items-center absolute top-0 grid bg-sophos-blue-600" > <img 
src="https://news.sophos.com/wp-content/uploads/2022/07/Category-Icon-X-Ops-v2.png" alt="AI Research" /> </div> </div> </div> <header> <div class="container mt-8 md:mt-16 md:-mb-4"> <div class="max-w-4xl mx-auto"> <h1 class="text-style-h1 mb-8">And I Shall Call It Mini-Me GPT: Using Large Language Models to Classify the Uncharted Web</h1> <div class="text-xl md:text-2xl -mt-2 mb-6"> Sophos AI team employs GPT and other large language models as teachers to train smaller models to label websites. </div> <div class="text-xl md:text-xl -mt-2"> <span class="byline"> Written by <span class="author vcard"> <a href="https://news.sophos.com/en-us/author/sean-gallagher/" title="Posts by Sean Gallagher" class="author url fn" rel="author">Sean Gallagher</a> </span> </span> </div> <div class="text-sophos-gray-600 mt-4 text-xs font-sansSemiBold font-semibold leading-tight uppercase"> <span class="posted-on"><a href="https://news.sophos.com/en-us/2023/06/22/using-large-language-models-classify-uncharted-web/" rel="bookmark">June 22, 2023</a></span> </div> <div class="mt-6 space-y-2 space-x-1"> <a href="https://news.sophos.com/en-us/category/ai-research/" class="category-tag-pill">AI Research</a> <a href="https://news.sophos.com/en-us/tag/bert/" class="category-tag-pill">BERT</a> <a href="https://news.sophos.com/en-us/tag/featured/" class="category-tag-pill">featured</a> <a href="https://news.sophos.com/en-us/tag/gpt-3/" class="category-tag-pill">GPT-3</a> <a href="https://news.sophos.com/en-us/tag/large-language-models/" class="category-tag-pill">Large Language Models</a> <a href="https://news.sophos.com/en-us/tag/sophos-x-ops/" class="category-tag-pill">Sophos X-Ops</a> <a href="https://news.sophos.com/en-us/tag/t5-large-llm/" class="category-tag-pill">T5 Large LLM</a> <a href="https://news.sophos.com/en-us/tag/web-filtering/" class="category-tag-pill">Web filtering</a> <a href="https://news.sophos.com/en-us/tag/website-categorization/" class="category-tag-pill">website 
categorization</a> </div> </div> </div> </header><!-- .entry-header --> <div class="container md:my-16 xl:my-24 my-8"> <div class="entry-content lg:prose-lg mx-auto prose max-w-4xl"> <p>While it may not seem as central to security as malware protection and breach detection, web content filtering plays an important role in ensuring regulatory compliance and the safety of workplaces as well as network security. Unlike security classification of URLs, which screens for malicious content such as malware or phishing, web filtering has to label content based not on attack mechanisms but on the nature of its content, a much more generalized problem than checking for malicious patterns in the content behind the URL.</p> <p>Website category labels generally describe what the content or purpose of the site is. Some categories are broad classifications such as “business,” “computers and internet,” “food and dining” and “entertainment”. Others focus on intent, such as “banking,” “shopping,” “search engines,” “social media,” “job search” and “education”. And then there are categories that may include content of concern—“sexually explicit,” “alcohol,” “marijuana” and “weapons,” for example. Organizations may want to set various policies for filtering or measuring the types of websites accessed from their networks.</p> <p>Sophos X-Ops has been researching ways to apply large language model (LLM) machine learning to web filtering to help catch the “long tail” of websites—those millions of domains that have relatively few visitors and little or no visibility to human analysts. LLMs themselves are not practical for this application because of their size and computational resource cost. 
But they can be used in turn as “teacher” models to train smaller models on categorization—reducing the computational resources required to generate labels on the fly for newly encountered domains.</p> <p>Using LLMs such as OpenAI’s GPT-3 and Google’s T5 Large, the SophosAI team was able to train much smaller models to classify never-before-screened URLs on the fly. Most importantly, the methodology used here could be applied to create small, economically deployable models based on the output of LLMs for other security tasks.</p> <p>The team’s research, detailed in a recently published paper entitled <a href="https://arxiv.org/pdf/2305.05027.pdf">“Web Content Filtering Through Knowledge Distillation of Large Language Models,”</a> explores ways in which LLMs can be used to bolster existing human-driven site classification, and to build systems that can be deployed to perform real-time labeling of never-before-seen URLs.</p> <h3>The “long tail” problem</h3> <p>Categorization of sites has relied largely on rule-based domain-to-category mapping, where analyst-crafted signatures are used to look for tell-tales in URLs to quickly assign labels to new domains. This sort of mapping is vital in speedy labeling of URLs on well-known sites and preventing false positives that block important content. The hands-on human identification of site classification patterns gets folded back into the domain mapping tools’ feature sets.</p> <p>The problem comes with the “long tail” of websites—those less-visited domains that typically don’t get signatures assigned to them. With the daily emergence of thousands of new websites, and with over a billion existing websites, maintaining and scaling signature-based approaches manually for the long tail has become increasingly challenging. 
That’s evident in the steep drop-off of labeling for less-visited domains—while well-known, high-traffic sites get nearly 100 percent coverage in most labeling schemes, as shown in the diagram below, the proportion of analyst-labeled domains begins to fall off quickly beyond the top hundred visited domains. Sites ranked below the top 5000 are less than 50 percent likely to have been labeled for content.</p> <p><a style="font-size: 1em" href="https://news.sophos.com/wp-content/uploads/2023/06/label_coverage.jpg"><img decoding="async" class="wp-image-92240 size-full" src="https://news.sophos.com/wp-content/uploads/2023/06/label_coverage.jpg" alt="Figure 1. Labeling of content relative to popularity of domains, derived from telemetry. Chart shows logarithmic plot of domains versus proportion with labeling, with almost all sites in the top 100 labeled but less popular sites having decreasing levels of coverage for each power of ten they're ranked in." width="640" height="400" srcset="https://news.sophos.com/wp-content/uploads/2023/06/label_coverage.jpg 8749w, https://news.sophos.com/wp-content/uploads/2023/06/label_coverage.jpg?resize=300,187 300w, https://news.sophos.com/wp-content/uploads/2023/06/label_coverage.jpg?resize=768,480 768w, https://news.sophos.com/wp-content/uploads/2023/06/label_coverage.jpg?resize=1024,639 1024w, https://news.sophos.com/wp-content/uploads/2023/06/label_coverage.jpg?resize=1536,959 1536w, https://news.sophos.com/wp-content/uploads/2023/06/label_coverage.jpg?resize=2048,1279 2048w" sizes="(max-width: 640px) 100vw, 640px" /></a></p> <p>Figure 1. Labeling of content relative to popularity of domains, derived from telemetry.</p> <p>One way to fix this is through application of machine learning for processing previously unlabeled domains. 
But up until now, most machine learning efforts (such as Microsoft’s <a href="https://arxiv.org/pdf/2106.05256.pdf">URLTran</a>) have used deep learning models to focus on the task of detecting security threats, rather than categorizing sites by content. These models could be retrained to perform multi-category classification, but they would require extremely large training sets of data. URLTran used over 1 million samples just for training on detection of malicious URLs.</p> <h3>Automating with AI</h3> <p>That’s where LLMs come in. Because LLMs are pre-trained on massive amounts of unlabeled text, the SophosAI team believed that they could be used to perform URL labeling more accurately and with much less initial data. The SophosAI team found that LLMs fine-tuned on data labeled with domain-propagation signatures have a 9% accuracy advantage over the state-of-the-art model architecture from Microsoft when tackling the “long tail” categorization problem—and only required a training set of thousands of URLs, rather than millions.</p> <p>The LLMs, using semantic relationships between the site classes and keywords within URLs in a smaller data set, were then used to create labels for an unlabeled set of data from long tail sites that were in turn used to train smaller models (the BERTiny and BERT-based URLTran transformer models and the 1D convolutional model eXpose). This “knowledge distillation” approach allowed the team to reach performance levels similar to that of the LLM with models 175 times smaller, reducing the number of parameters from 770 million to just 4 million.</p> <figure id="attachment_92256" aria-describedby="caption-attachment-92256" style="width: 1200px" class="wp-caption alignnone"><a href="https://news.sophos.com/wp-content/uploads/2023/06/knowledge-distill-2.jpg"><img decoding="async" class="wp-image-92256 size-full" src="https://news.sophos.com/wp-content/uploads/2023/06/knowledge-distill-2.jpg" alt="A three-step process: 1. 
Manually-labeled URLs are used to train large language models to semantically recognize categories for URLs. 2. Trained LLMs categorize a larger set of unlabeled URLs, which are then used to train smaller models (BERTiny, eXpose, URLTran). 3. The best-performing small model can then be deployed in combination with existing rules to catch long-tail websites." width="1200" height="675" srcset="https://news.sophos.com/wp-content/uploads/2023/06/knowledge-distill-2.jpg 1200w, https://news.sophos.com/wp-content/uploads/2023/06/knowledge-distill-2.jpg?resize=300,169 300w, https://news.sophos.com/wp-content/uploads/2023/06/knowledge-distill-2.jpg?resize=768,432 768w, https://news.sophos.com/wp-content/uploads/2023/06/knowledge-distill-2.jpg?resize=1024,576 1024w" sizes="(max-width: 1200px) 100vw, 1200px" /></a><figcaption id="caption-attachment-92256" class="wp-caption-text">Figure 2. How knowledge distillation was used to create deployable models.</figcaption></figure> <p>While the most accurate sets of models created performed far better than models trained via “deep learning” alone, their accuracy fell short of perfection—even the best models scored under 50 percent accuracy. Many URLs failed to be properly labeled simply because they didn’t have sufficient “signals” embedded in them, while others had keywords that could be associated with multiple classifications—creating uncertainty that could only be clarified by deeper examination of the content behind the URL.</p> <p><a href="https://news.sophos.com/wp-content/uploads/2023/06/domain_scaling.jpg"><img loading="lazy" decoding="async" class="alignnone wp-image-92243 size-full" src="https://news.sophos.com/wp-content/uploads/2023/06/domain_scaling.jpg" alt="Figures 3 and 4. An accuracy plot of trained models. 
LLMs outperformed smaller models trained with deep learning, and reached highest accuracy with less data." width="640" height="384" srcset="https://news.sophos.com/wp-content/uploads/2023/06/domain_scaling.jpg 10000w, https://news.sophos.com/wp-content/uploads/2023/06/domain_scaling.jpg?resize=300,180 300w, https://news.sophos.com/wp-content/uploads/2023/06/domain_scaling.jpg?resize=768,461 768w, https://news.sophos.com/wp-content/uploads/2023/06/domain_scaling.jpg?resize=1024,614 1024w, https://news.sophos.com/wp-content/uploads/2023/06/domain_scaling.jpg?resize=1536,922 1536w, https://news.sophos.com/wp-content/uploads/2023/06/domain_scaling.jpg?resize=2048,1229 2048w" sizes="auto, (max-width: 640px) 100vw, 640px" /></a></p> <figure id="attachment_92242" aria-describedby="caption-attachment-92242" style="width: 640px" class="wp-caption alignnone"><a href="https://news.sophos.com/wp-content/uploads/2023/06/domain_augmentation.jpg"><img loading="lazy" decoding="async" class="wp-image-92242 size-full" src="https://news.sophos.com/wp-content/uploads/2023/06/domain_augmentation.jpg" alt="Smaller models trained by the LLMs approached the same level of accuracy as the LLMs themselves." width="640" height="384" srcset="https://news.sophos.com/wp-content/uploads/2023/06/domain_augmentation.jpg 10000w, https://news.sophos.com/wp-content/uploads/2023/06/domain_augmentation.jpg?resize=300,180 300w, https://news.sophos.com/wp-content/uploads/2023/06/domain_augmentation.jpg?resize=768,461 768w, https://news.sophos.com/wp-content/uploads/2023/06/domain_augmentation.jpg?resize=1024,614 1024w, https://news.sophos.com/wp-content/uploads/2023/06/domain_augmentation.jpg?resize=1536,922 1536w, https://news.sophos.com/wp-content/uploads/2023/06/domain_augmentation.jpg?resize=2048,1229 2048w" sizes="auto, (max-width: 640px) 100vw, 640px" /></a><figcaption id="caption-attachment-92242" class="wp-caption-text">Figures 3 and 4. An accuracy plot of trained models. 
LLMs outperformed smaller models trained with deep learning, but the smaller models approached the same level of accuracy when the LLMs were used as teaching models. (The Y axis in both charts runs from 0 to 0.5 accuracy.)</figcaption></figure> <p>However, the T5 Large model performed reasonably well on categories that would potentially be filtered out, as shown in the confusion matrix below—with gambling and peer-to-peer sharing sites having near-perfect labeling on test data. Alcohol, weapons, and pornography sites also had better than 60% true positive detection rates.</p> <figure id="attachment_92244" aria-describedby="caption-attachment-92244" style="width: 640px" class="wp-caption alignnone"><a href="https://news.sophos.com/wp-content/uploads/2023/06/confusion_matrix.jpg"><img loading="lazy" decoding="async" class="wp-image-92244 size-full" src="https://news.sophos.com/wp-content/uploads/2023/06/confusion_matrix.jpg" alt="Figure 5. A confusion matrix showing the relationship between the labels the T5 Large model assigned to test URLs, and their true manually-assigned labels." width="640" height="543" srcset="https://news.sophos.com/wp-content/uploads/2023/06/confusion_matrix.jpg 10335w, https://news.sophos.com/wp-content/uploads/2023/06/confusion_matrix.jpg?resize=300,255 300w, https://news.sophos.com/wp-content/uploads/2023/06/confusion_matrix.jpg?resize=768,652 768w, https://news.sophos.com/wp-content/uploads/2023/06/confusion_matrix.jpg?resize=1024,869 1024w, https://news.sophos.com/wp-content/uploads/2023/06/confusion_matrix.jpg?resize=1536,1304 1536w, https://news.sophos.com/wp-content/uploads/2023/06/confusion_matrix.jpg?resize=2048,1739 2048w" sizes="auto, (max-width: 640px) 100vw, 640px" /></a><figcaption id="caption-attachment-92244" class="wp-caption-text">Figure 5. 
A confusion matrix showing the relationship between the labels the T5 Large model assigned to test URLs, and their true manually-assigned labels.</figcaption></figure> <p>There are several ways to improve this accuracy going forward that the SophosAI team has suggested. First, allowing for the assignment of multiple categories to a site would eliminate problems with category overlap. Augmenting the URL samples with retrieved HTML and images from them could also provide better recognition of their categorization. And newer LLMs, such as GPT-4, could be used as a teacher.</p> <p>When combined with existing processes, this form of AI-based classification can greatly improve the handling of long tail websites. And there are other security-related tasks that the “knowledge distillation” methodology tested in this experiment could be applied to.</p> <p>For more details, see the paper authored by Tamas Voros, Sean Bergeron and Head of SophosAI Konstantin Berlin <a href="https://arxiv.org/pdf/2305.05027.pdf">here on arxiv.org</a>.</p> <p> </p> <p> </p> <p> </p> <div class="sharedaddy sd-sharing-enabled"><div class="robots-nocontent sd-block sd-social sd-social-icon-text sd-sharing"><h3 class="sd-title">Share this:</h3><div class="sd-content"><ul><li class="share-mastodon"><a rel="nofollow noopener noreferrer" data-shared="sharing-mastodon-92239" class="share-mastodon sd-button share-icon" href="https://news.sophos.com/en-us/2023/06/22/using-large-language-models-classify-uncharted-web/?share=mastodon" target="_blank" title="Click to share on Mastodon" ><span>Mastodon</span></a></li><li class="share-bluesky"><a rel="nofollow noopener noreferrer" data-shared="sharing-bluesky-92239" class="share-bluesky sd-button share-icon" href="https://news.sophos.com/en-us/2023/06/22/using-large-language-models-classify-uncharted-web/?share=bluesky" target="_blank" title="Click to share on Bluesky" ><span>Bluesky</span></a></li><li class="share-reddit"><a rel="nofollow noopener 
noreferrer" data-shared="" class="share-reddit sd-button share-icon" href="https://news.sophos.com/en-us/2023/06/22/using-large-language-models-classify-uncharted-web/?share=reddit" target="_blank" title="Click to share on Reddit" ><span>Reddit</span></a></li><li class="share-linkedin"><a rel="nofollow noopener noreferrer" data-shared="sharing-linkedin-92239" class="share-linkedin sd-button share-icon" href="https://news.sophos.com/en-us/2023/06/22/using-large-language-models-classify-uncharted-web/?share=linkedin" target="_blank" title="Click to share on LinkedIn" ><span>LinkedIn</span></a></li><li><a href="#" class="sharing-anchor sd-button share-more"><span>More</span></a></li><li class="share-end"></li></ul><div class="sharing-hidden"><div class="inner" style="display: none;"><ul><li class="share-tumblr"><a rel="nofollow noopener noreferrer" data-shared="" class="share-tumblr sd-button share-icon" href="https://news.sophos.com/en-us/2023/06/22/using-large-language-models-classify-uncharted-web/?share=tumblr" target="_blank" title="Click to share on Tumblr" ><span>Tumblr</span></a></li><li class="share-pocket"><a rel="nofollow noopener noreferrer" data-shared="" class="share-pocket sd-button share-icon" href="https://news.sophos.com/en-us/2023/06/22/using-large-language-models-classify-uncharted-web/?share=pocket" target="_blank" title="Click to share on Pocket" ><span>Pocket</span></a></li><li class="share-print"><a rel="nofollow noopener noreferrer" data-shared="" class="share-print sd-button share-icon" href="https://news.sophos.com/en-us/2023/06/22/using-large-language-models-classify-uncharted-web/#print" target="_blank" title="Click to print" ><span>Print</span></a></li><li class="share-email"><a rel="nofollow noopener noreferrer" data-shared="" class="share-email sd-button share-icon" 
href="mailto:?subject=%5BShared%20Post%5D%20And%20I%20Shall%20Call%20It%20Mini-Me%20GPT%3A%20Using%20Large%20Language%20Models%20to%20Classify%20the%20Uncharted%20Web&body=https%3A%2F%2Fnews.sophos.com%2Fen-us%2F2023%2F06%2F22%2Fusing-large-language-models-classify-uncharted-web%2F&share=email" target="_blank" title="Click to email a link to a friend" data-email-share-error-title="Do you have email set up?" data-email-share-error-text="If you're having problems sharing via email, you might not have email set up for your browser. You may need to create a new email yourself." data-email-share-nonce="e7ebd86b1b" data-email-share-track-url="https://news.sophos.com/en-us/2023/06/22/using-large-language-models-classify-uncharted-web/?share=email"><span>Email</span></a></li><li class="share-end"></li></ul></div></div></div></div></div> </div> <div class="mt-12"> <ul id="social-sharing" class="flex justify-center items-center space-x-6" > <li class="facebook"> <a class="js-share-modal" href="http://www.facebook.com/share.php?u=https://news.sophos.com/?p=92239&title=And%20I%20Shall%20Call%20It%20Mini-Me%20GPT:%20Using%20Large%20Language%20Models%20to%20Classify%20the%20Uncharted%20Web" data-title="And I Shall Call It Mini-Me GPT: Using Large Language Models to Classify the Uncharted Web" title="Share on Facebook"> <span class="sr-only">Share on Facebook</span> <svg width="8" height="16" xmlns="http://www.w3.org/2000/svg" class="text-sophos-gray-600 hover:text-black" fill="currentColor" > <path d="M7.145 8.006H4.903V16H1.581V8.006H0V5.182h1.581V3.354C1.581 2.045 2.202 0 4.933 0l2.461.01v2.742H5.608c-.291 0-.705.145-.705.77v1.66h2.533l-.291 2.824z" fill-rule="nonzero"/> </svg> </a> </li> <li class="twitter"> <a class="js-share-modal" href="http://twitter.com/intent/tweet?text=And%20I%20Shall%20Call%20It%20Mini-Me%20GPT%3A%20Using%20Large%20Language%20Models%20to%20Classify%20the%20Uncharted%20Web%20https%3A%2F%2Fnews.sophos.com%2F%3Fp%3D92239" data-title="" title="Share on 
X"> <span class="sr-only">Share on X</span> <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" class="text-sophos-gray-600 hover:text-black" fill="currentColor" > <path d="M12.163 1.5h2.206L9.55 7.006l5.669 7.494H10.78L7.303 9.956 3.328 14.5h-2.21l5.154-5.89L.838 1.5h4.55l3.14 4.153zm-.776 11.681h1.222L4.722 2.75H3.409z"/> </svg> </a> </li> <li class="linkedin"> <a href="http://www.linkedin.com/shareArticle?mini=true&url=https://news.sophos.com/en-us/2023/06/22/using-large-language-models-classify-uncharted-web/" data-title="And I Shall Call It Mini-Me GPT: Using Large Language Models to Classify the Uncharted Web" title="Share on LinkedIn" onclick="window.open(this.href, '', 'left=20,top=20,width=500,height=500,toolbar=1,resizable=0'); return false;"> <span class="sr-only">Share on LinkedIn</span> <svg width="16" height="16" xmlns="http://www.w3.org/2000/svg" class="text-sophos-gray-600 hover:text-black" fill="currentColor" > <path d="M16 15.293h-3.43v-5.52c0-1.386-.496-2.334-1.738-2.334-.946 0-1.512.64-1.76 1.256-.09.22-.113.526-.113.836v5.762H5.53s.044-9.35 0-10.316h3.43v1.46c.456-.705 1.27-1.703 3.091-1.703 2.256 0 3.95 1.473 3.95 4.643v5.916zM1.917 3.566h-.022C.745 3.566 0 2.773 0 1.783 0 .772.768 0 1.94 0c1.173 0 1.896.772 1.917 1.783 0 .99-.744 1.783-1.94 1.783zM.202 15.293h3.431V4.977H.203v10.316z" fill-rule="nonzero"/> </svg> </a> </li> <li class="comments"> <a href="#comments" title="Leave a Reply" class="flex items-center space-x-1" > <svg width="16" height="16" xmlns="http://www.w3.org/2000/svg" class="text-sophos-gray-600 hover:text-black" fill="currentColor" > <path d="M8.5 0a7.5 7.5 0 11-3.916 13.898C3.317 15.273 1.773 15.36.256 15.135c1.011-1.185 1.678-2.357 2-3.517l-.007.027A7.5 7.5 0 018.5 0z" fill-rule="evenodd"/> </svg> </a> </li> </ul> </div><!-- .entry-social --> </div> </article><!-- #post-## --> <div class="container my-8 md:my-16"> <div class="max-w-4xl mx-auto"> <div class="article-author-block article-co-authors-block"> <div 
class="author-block">
  <div class="author-block__profile">
    <img width="400" height="400" src="https://news.sophos.com/wp-content/uploads/2020/02/sean-gallagher.jpg?w=400" class="avatar avatar-400 photo wp-post-image" alt="Sean Gallagher" />
  </div> <!-- .author-profile -->
  <div class="author-block__wrapper">
    <div class="author-block__content">
      <div class="author-block__about">
        About the Author
      </div>
      <h3 class="author-block__name">
        <a href="https://news.sophos.com/en-us/author/sean-gallagher/" title="Posts by Sean Gallagher" class="author url fn" rel="author">Sean Gallagher</a>
      </h3>
      <div class="author-block__bio">
        <p>Sean Gallagher is Principal Threat Researcher, Sophos X-Ops. Prior to joining Sophos, he was an information security and technology journalist for over 30 years, including 10 as information security and national security editor for Ars Technica.</p>
      </div> <!-- .author-bio -->
    </div>
  </div>
</div> <!-- .author-block-container -->
</div>
</div>
</div>

<!-- Related posts: a three-column grid of article cards. -->
<div class="pb-24 bg-white">
  <div class="container">
    <div class="max-w-5xl mx-auto">
      <h3 class="text-style-h2 md:my-8 my-4">
        Read Similar Articles
      </h3>
      <div class="article-grid article-grid--3-column">

        <!-- Article -->
        <article id="post-75410" class="hover:shadow-lg dark:bg-sophos-gray-900 border-sophos-gray-200 flex flex-col overflow-hidden text-gray-700 transition-all bg-white border rounded-md shadow-md post-75410 post type-post status-publish format-standard has-post-thumbnail hentry category-security-operations tag-encryption tag-mtr tag-ransomware tag-security tag-sidebar tag-sophos-rapid-response region-en-us">
          <!-- Image: this link has no text and repeats the title link's href, so it is
               hidden from assistive technology and taken out of the tab order. -->
          <a class="aspect-w-16 aspect-h-9 flex block bg-gray-400 bg-right bg-no-repeat bg-cover"
             href="https://news.sophos.com/en-us/2021/05/24/what-to-expect-when-youve-been-hit-with-avaddon-ransomware/"
             rel="bookmark"
             aria-hidden="true"
             tabindex="-1"
             style="background-image: url('https://news.sophos.com/wp-content/uploads/2021/05/sophos-ransomware-web-banner-1200x628px-option-2.png?w=640');"></a>
          <!-- Wrapper -->
          <div class="flex flex-col justify-between flex-grow">
            <!-- Content -->
            <div class="sm:px-8 sm:py-8 p-4 py-6">
              <!-- Date -->
              <div class="text-sophos-blue-600 font-sansMedium mb-2 text-xs leading-tight uppercase truncate">
                <time datetime="2021-05-24">May 24, 2021</time>
              </div>
              <!-- Post Title -->
              <h2 class="text-style-h2 line-clamp-3 sm:mb-4 sm:text-2xl sm:leading-snug text-lg leading-tight text-gray-700"><a href="https://news.sophos.com/en-us/2021/05/24/what-to-expect-when-youve-been-hit-with-avaddon-ransomware/" rel="bookmark" class="dark:text-white font-sansSemiBold font-semibold text-gray-900 no-underline cursor-pointer">What to expect when you’ve been hit with Avaddon ransomware</a></h2>
              <!-- Excerpt -->
            </div>
          </div>
        </article>

        <!-- Article -->
        <article id="post-75301" class="hover:shadow-lg dark:bg-sophos-gray-900 border-sophos-gray-200 flex flex-col overflow-hidden text-gray-700 transition-all bg-white border rounded-md shadow-md post-75301 post type-post status-publish format-standard has-post-thumbnail hentry category-products-services tag-intercept-x tag-sidebar tag-sophos-edr tag-sophos-xdr region-en-us">
          <!-- Image: text-less duplicate of the title link; hidden from assistive
               technology and taken out of the tab order. -->
          <a class="aspect-w-16 aspect-h-9 flex block bg-gray-400 bg-right bg-no-repeat bg-cover"
             href="https://news.sophos.com/en-us/2021/05/19/whats-new-in-sophos-edr-4-0/"
             rel="bookmark"
             aria-hidden="true"
             tabindex="-1"
             style="background-image: url('https://news.sophos.com/wp-content/uploads/2021/05/sophos-edr-news-blog-image-838x440px@2x.png?w=640');"></a>
          <!-- Wrapper -->
          <div class="flex flex-col justify-between flex-grow">
            <!-- Content -->
            <div class="sm:px-8 sm:py-8 p-4 py-6">
              <!-- Date -->
              <div class="text-sophos-blue-600 font-sansMedium mb-2 text-xs leading-tight uppercase truncate">
                <time datetime="2021-05-19">May 19, 2021</time>
              </div>
              <!-- Post Title -->
              <h2 class="text-style-h2 line-clamp-3 sm:mb-4 sm:text-2xl sm:leading-snug text-lg leading-tight text-gray-700"><a href="https://news.sophos.com/en-us/2021/05/19/whats-new-in-sophos-edr-4-0/" rel="bookmark" class="dark:text-white font-sansSemiBold font-semibold text-gray-900 no-underline cursor-pointer">What’s New in Sophos EDR 4.0</a></h2>
              <!-- Excerpt -->
            </div>
          </div>
        </article>

        <!-- Article -->
        <article id="post-75396" class="hover:shadow-lg dark:bg-sophos-gray-900 border-sophos-gray-200 flex flex-col overflow-hidden text-gray-700 transition-all bg-white border rounded-md shadow-md post-75396 post type-post status-publish format-standard has-post-thumbnail hentry category-products-services tag-sidebar tag-sophos-xdr tag-xdr region-en-us">
          <!-- Image: text-less duplicate of the title link; hidden from assistive
               technology and taken out of the tab order. -->
          <a class="aspect-w-16 aspect-h-9 flex block bg-gray-400 bg-right bg-no-repeat bg-cover"
             href="https://news.sophos.com/en-us/2021/05/19/sophos-xdr-driven-by-data/"
             rel="bookmark"
             aria-hidden="true"
             tabindex="-1"
             style="background-image: url('https://news.sophos.com/wp-content/uploads/2021/05/sophos-xdr.png?w=640');"></a>
          <!-- Wrapper -->
          <div class="flex flex-col justify-between flex-grow">
            <!-- Content -->
            <div class="sm:px-8 sm:py-8 p-4 py-6">
              <!-- Date -->
              <div class="text-sophos-blue-600 font-sansMedium mb-2 text-xs leading-tight uppercase truncate">
                <time datetime="2021-05-19">May 19, 2021</time>
              </div>
              <!-- Post Title -->
              <h2 class="text-style-h2 line-clamp-3 sm:mb-4 sm:text-2xl sm:leading-snug text-lg leading-tight text-gray-700"><a href="https://news.sophos.com/en-us/2021/05/19/sophos-xdr-driven-by-data/" rel="bookmark" class="dark:text-white font-sansSemiBold font-semibold text-gray-900 no-underline cursor-pointer">Sophos XDR: Driven by data</a></h2>
              <!-- Excerpt -->
            </div>
          </div>
        </article>

      </div>
    </div>
  </div>
</div> <!-- #secondary -->
</main><!-- #main -->
</div><!-- #primary -->
</div> <!-- #content -->

<!-- Newsletter sign-up section; the container is shown while `subscribed` is falsy. -->
<div class="bg-sophos-gray-50 md:py-16 px-4 pb-4 pt-8">
  <div class="container max-w-2xl" x-show="!subscribed">
    <div class="text-style-h2-lg">
      Subscribe to get the latest updates in your inbox.
</div>
<div id="mc_embed_shell">
  <!-- Mailchimp base stylesheet, loaded over explicit HTTPS rather than a protocol-relative URL. -->
  <link href="https://cdn-images.mailchimp.com/embedcode/classic-061523.css" rel="stylesheet" type="text/css">
  <style type="text/css">
    /* Add your own Mailchimp form style overrides in your site stylesheet or in this style block. We recommend moving this block and the preceding CSS link to the HEAD of your HTML file. */
    #mc_embed_signup form,
    #mc_embed_signup #mc-embedded-subscribe-form div.mce_inline_error { margin:0; background: transparent; }
    #mc_embed_signup input { border-color: rgba(240, 242, 244, var(--tw-border-opacity)); }
    #mc_embed_signup input#mc-embedded-subscribe { border-radius: 9999px; }
    #mc-embedded-subscribe { margin-left:0; }
    #mc_embed_signup .mc-field-group.input-group input { height:1rem; width:1rem; }
    #mc_embed_signup #mc-embedded-subscribe-form input.mce_inline_error { border-color: rgba( 209, 213, 219, var( --tw-border-opacity ) );}
    #mc_embed_signup #mce-success-response { display: block; color: #fff; font-weight: normal; padding: .75rem 1rem; margin: 0; }
    #mc_embed_signup div#mce-responses { padding: 0; width: 100%; margin: .5rem 0; }
    #mc_embed_signup div.response { width:100%; padding: .75rem 1rem; font-weight: normal; }
  </style>
  <div id="mc_embed_signup">
    <!-- Ampersands in the action URL are escaped; the parsed URL is unchanged. -->
    <form action="https://sophos.us2.list-manage.com/subscribe/post?u=2a2849a8c809119f4bd4929cc&amp;id=8d6471d831&amp;f_id=007062e1f0" method="post" id="mc-embedded-subscribe-form" name="mc-embedded-subscribe-form" class="validate" target="_blank">
      <div id="mc_embed_signup_scroll">
        <div class="mc-field-group">
          <!-- The field has no visible label (only a placeholder), so aria-label supplies its accessible name. -->
          <input type="email" name="EMAIL" class="required email" id="mce-EMAIL" required value="" placeholder="name@email.com" autocomplete="email" aria-label="Email address">
          <!-- Both response boxes start hidden. They are presumably filled in by the form
               validation script (confirm); aria-live lets screen readers announce them. -->
          <div id="mce-responses" class="clear flex flex-col my-6" aria-live="polite">
            <div class="response font-sansMedium px-4 py-3 mt-2 text-sm font-medium text-white bg-black border rounded-md" id="mce-error-response" style="display: none;"></div>
            <div class="response font-sansMedium px-4 py-3 mt-2 text-sm font-medium text-white bg-black border rounded-md" id="mce-success-response" style="display: none;"></div>
          </div>
        </div>
        <!-- Category checkboxes, exposed as one labelled group. -->
        <!-- NOTE(review): every checkbox below submits value="" - confirm the list
             backend accepts an empty value for these group fields. -->
        <div class="mc-field-group input-group mb-4 text-lg" role="group" aria-labelledby="mce-categories-label">
          <span id="mce-categories-label">Which categories are you interested in?</span>
          <ul>
            <li><input type="checkbox" name="group[3][1]" id="mce-group[3]-3-0" value=""><label for="mce-group[3]-3-0" class="text-style-form-label ml-2">Products and Services</label></li>
            <li><input type="checkbox" name="group[3][2]" id="mce-group[3]-3-1" value=""><label for="mce-group[3]-3-1" class="text-style-form-label ml-2">Threat Research</label></li>
            <li><input type="checkbox" name="group[3][4]" id="mce-group[3]-3-2" value=""><label for="mce-group[3]-3-2" class="text-style-form-label ml-2">Security Operations</label></li>
            <li><input type="checkbox" name="group[3][8]" id="mce-group[3]-3-3" value=""><label for="mce-group[3]-3-3" class="text-style-form-label ml-2">AI Research</label></li>
            <li><input type="checkbox" name="group[3][16]" id="mce-group[3]-3-4" value=""><label for="mce-group[3]-3-4" class="text-style-form-label ml-2">#SophosLife</label></li>
          </ul>
        </div>
        <!-- Off-screen text field, hidden from assistive technology and the tab order. -->
        <div aria-hidden="true" style="position: absolute; left: -5000px;">
          <input type="text" name="b_2a2849a8c809119f4bd4929cc_8d6471d831" tabindex="-1" value="">
        </div>
        <div class="clear">
          <input type="submit" name="subscribe" id="mc-embedded-subscribe" class="round-button round-button--primary" value="Subscribe">
        </div>
      </div>
    </form>
  </div>
</div>
</div>
</div>

<!-- Site footer. The Alpine state holds one open/closed flag per pop-up menu. -->
<footer class="bg-white border-t border-sophos-gray-200"
        x-data="{ languageMenu: false, privacyMenu: false, legalMenu: false }">
  <div class="container">
    <div class="md:flex-row md:items-center flex flex-col justify-between py-8">
      <div class="flex items-baseline flex-grow space-x-6">
        <!-- Language -->
        <div class="relative mr-auto">
          <!-- Toggles the region list. aria-expanded mirrors the Alpine flag as an explicit
               'true'/'false' string, because Alpine v2 removes attributes bound to false. -->
          <a href="#"
             class="whitespace-nowrap font-sansMedium text-sophos-gray-600 inline-block text-xs font-medium leading-tight"
             :aria-expanded="languageMenu ? 'true' : 'false'"
             @click.prevent="languageMenu = !languageMenu"
             @click.away="languageMenu = false">
            Change Region
            <svg xmlns="http://www.w3.org/2000/svg"
width="8" height="4" aria-hidden="true" class="inline-block transition-transform transform" :class="{'rotate-180': languageMenu }">
              <path fill="#7F8C9D" fill-rule="evenodd" d="M4 2.178L5.915.262a.708.708 0 01.996 0 .702.702 0 010 .995L4.75 3.415A.7.7 0 014 3.94a.702.702 0 01-.751-.524l-2.16-2.158a.702.702 0 11.996-.995L4 2.178z" />
            </svg>
          </a>
          <!-- Language Menu: a plain list of links shown while languageMenu is true.
               It has no menuitem children, so it carries no role="menu". -->
          <div class="focus:outline-none border-sophos-gray-200 absolute bottom-0 left-0 w-48 px-4 py-1 py-4 mb-8 -ml-4 origin-bottom-left bg-white border rounded-md shadow-md"
               x-show="languageMenu"
               x-cloak
               x-transition:enter="transition-all ease-out duration-100"
               x-transition:enter-start="transform opacity-0 scale-95"
               x-transition:enter-end="transform opacity-100 scale-100"
               x-transition:leave="transition ease-in duration-75"
               x-transition:leave-start="transform opacity-100 scale-100"
               x-transition:leave-end="transform opacity-0 scale-95">
            <ul class="font-sansMedium text-sophos-gray-600 space-y-1 text-xs font-medium">
              <li><a href="https://news.sophos.com/es-419">América Latina</a></li>
              <li><a href="https://news.sophos.com/pt-br">Brasil</a></li>
              <li><a href="https://news.sophos.com/de-de">Deutschland</a></li>
              <li><a href="https://news.sophos.com/en-us">English</a></li>
              <li><a href="https://news.sophos.com/fr-fr">France</a></li>
              <li><a href="https://news.sophos.com/es-es">Iberia</a></li>
              <li><a href="https://news.sophos.com/it-it">Italia</a></li>
              <li><a href="https://news.sophos.com/ja-jp">Japan</a></li>
            </ul>
          </div>
        </div>

        <!-- Terms -->
        <a href="https://www.sophos.com/en-us/legal/sophos-website.aspx"
           class="whitespace-nowrap font-sansMedium text-sophos-gray-600 inline-block ml-auto text-xs font-medium leading-tight">Terms</a>

        <!-- Privacy: the wrapper is a div (not a span) because it contains a block-level panel. -->
        <div class="relative">
          <!-- aria-expanded mirrors the Alpine flag as an explicit 'true'/'false' string,
               because Alpine v2 removes attributes bound to false. -->
          <a href="#"
             class="whitespace-nowrap font-sansMedium text-sophos-gray-600 inline-block text-xs font-medium leading-tight"
             :aria-expanded="privacyMenu ? 'true' : 'false'"
             @click.prevent="privacyMenu = !privacyMenu"
             @click.away="privacyMenu = false">
            Privacy
            <svg xmlns="http://www.w3.org/2000/svg" width="8" height="4" aria-hidden="true" class="inline-block transition-transform transform" :class="{'rotate-180': privacyMenu }">
              <path fill="#7F8C9D" fill-rule="evenodd" d="M4 2.178L5.915.262a.708.708 0 01.996 0 .702.702 0 010 .995L4.75 3.415A.7.7 0 014 3.94a.702.702 0 01-.751-.524l-2.16-2.158a.702.702 0 11.996-.995L4 2.178z" />
            </svg>
          </a>
          <!-- Privacy links, shown while privacyMenu is true. -->
          <div class="focus:outline-none border-sophos-gray-200 absolute bottom-0 left-0 w-48 px-4 py-1 py-4 mb-8 -ml-4 origin-bottom-left bg-white border rounded-md shadow-md"
               x-show="privacyMenu"
               x-cloak
               x-transition:enter="transition-all ease-out duration-100"
               x-transition:enter-start="transform opacity-0 scale-95"
               x-transition:enter-end="transform opacity-100 scale-100"
               x-transition:leave="transition ease-in duration-75"
               x-transition:leave-start="transform opacity-100 scale-100"
               x-transition:leave-end="transform opacity-0 scale-95">
            <ul class="font-sansMedium text-sophos-gray-600 space-y-1 text-xs font-medium">
              <li><a href="https://www.sophos.com/en-us/legal/sophos-group-privacy-policy.aspx">Privacy Notice</a></li>
              <li><a href="https://www.sophos.com/en-us/legal/cookie-information.aspx">Cookies</a></li>
            </ul>
          </div>
        </div>

        <!-- Legal: the wrapper is a div (not a span) because it contains a block-level panel. -->
        <div class="relative">
          <!-- aria-expanded mirrors the Alpine flag as an explicit 'true'/'false' string,
               because Alpine v2 removes attributes bound to false. -->
          <a href="#"
             class="whitespace-nowrap font-sansMedium text-sophos-gray-600 inline-block text-xs font-medium leading-tight"
             :aria-expanded="legalMenu ? 'true' : 'false'"
             @click.prevent="legalMenu = !legalMenu"
             @click.away="legalMenu = false">
            Legal
            <svg xmlns="http://www.w3.org/2000/svg" width="8" height="4" aria-hidden="true" class="inline-block transition-transform transform" :class="{'rotate-180': legalMenu }">
              <path fill="#7F8C9D" fill-rule="evenodd" d="M4 2.178L5.915.262a.708.708 0 01.996 0 .702.702 0 010 .995L4.75 3.415A.7.7 0 014 3.94a.702.702 0 01-.751-.524l-2.16-2.158a.702.702 0 11.996-.995L4 2.178z" />
            </svg>
          </a>
          <!-- Legal links, shown while legalMenu is true. -->
          <div class="focus:outline-none border-sophos-gray-200 absolute bottom-0 left-0 w-48 px-4 py-1 py-4 mb-8 -ml-4 origin-bottom-left bg-white border rounded-md shadow-md"
               x-show="legalMenu"
               x-cloak
               x-transition:enter="transition-all ease-out duration-100"
               x-transition:enter-start="transform opacity-0 scale-95"
               x-transition:enter-end="transform opacity-100 scale-100"
               x-transition:leave="transition ease-in duration-75"
               x-transition:leave-start="transform opacity-100 scale-100"
               x-transition:leave-end="transform opacity-0 scale-95">
            <ul class="font-sansMedium text-sophos-gray-600 space-y-1 text-xs font-medium">
              <li><a href="https://www.sophos.com/en-us/legal.aspx">General</a></li>
              <li><a href="https://www.sophos.com/en-us/legal/modern-slavery-act-transparency-statement.aspx">Modern Slavery Statement</a></li>
              <li><a href="https://secure.ethicspoint.eu/domain/media/en/gui/104916/index.html">Speak Out</a></li>
            </ul>
          </div>
        </div>
      </div>

      <!-- Copyright -->
      <div class="md:ml-6 mt-2 md:mt-0 md:items-center mr-auto ml-auto">
        <span class="whitespace-nowrap font-sansMedium text-sophos-gray-600 inline-block text-xs font-medium leading-tight">
          © 1997 - 2025 Sophos Ltd. All rights reserved
        </span>
      </div>
    </div>
  </div>
</footer>

<div id="amp-mobile-version-switcher" hidden>
  <a rel="" href="https://news.sophos.com/en-us/2023/06/22/using-large-language-models-classify-uncharted-web/?amp=1">
    Go to mobile version
  </a>
</div>

<!-- Generated script tags below are reproduced unchanged. -->
<script type="text/javascript">
  window.WPCOM_sharing_counts = {"https:\/\/news.sophos.com\/en-us\/2023\/06\/22\/using-large-language-models-classify-uncharted-web\/":92239};
</script>
<script type="text/javascript" id="sophos-js-core-js-extra">
/* <![CDATA[ */
var PG8Data = {"startPage":"1","maxPages":"1","nextLink":""};
/* ]]> */
</script>
<script type="text/javascript" src="https://news.sophos.com/_static/??-eJyVjFsKwyAQRTdUnWqg5qd0LSJDookPnGnT5deSDWTgfh3OuXA0FWphLAy8YkYCqm2tVPAgZe/GQaKxIdUtovpYbbUZQJ9AJ7rBpY9Qu8DGL3dP1/0T6TfHXV4Fn5uPSxGHzXcu2MXd4sXJn43olZ/GTfP8MGZy6QegwaBF" ></script><script type="text/javascript" src="https://unpkg.com/alpinejs@2.8.1/dist/alpine.js?ver=2.0.3" id="alpine-js-js"></script>
<script type="text/javascript" src="https://news.sophos.com/wp-content/themes/sophosnews-2017/js/sophos-mc-validate.js?m=1738861137g" ></script><script type="text/javascript" src="https://stats.wp.com/e-202508.js" id="jetpack-stats-js" data-wp-strategy="defer"></script>
<script type="text/javascript" id="jetpack-stats-js-after">
/* <![CDATA[ */
_stq = window._stq || [];
_stq.push([ "view", JSON.parse("{\"v\":\"ext\",\"blog\":\"166161023\",\"post\":\"92239\",\"tz\":\"-5\",\"srv\":\"news.sophos.com\",\"hp\":\"vip\",\"j\":\"1:14.2.1\"}") ]);
_stq.push([ "clickTrackerInit", "166161023", "92239" ]);
/* ]]> */
</script>
<script type="text/javascript" id="sharing-js-js-extra">
/* <![CDATA[ */
var sharing_js_options = {"lang":"en","counts":"1","is_stats_active":"1"};
/* ]]> */
</script>
<script type="text/javascript" src="https://news.sophos.com/wp-content/mu-plugins/jetpack-14.2/_inc/build/sharedaddy/sharing.min.js?ver=14.2.1" id="sharing-js-js"></script>
<script type="text/javascript" id="sharing-js-js-after">
/*
<![CDATA[ */
// Handle to the most recently opened share pop-up, shared by every handler
// below. Declared with `var` at script scope so it remains a global.
var windowOpen;

( function () {
  /**
   * Report whether `el` matches the CSS selector `sel`, using
   * Element.matches or the prefixed msMatchesSelector when available.
   * Returns false for nodes that expose neither method.
   */
  function matches( el, sel ) {
    return !! (
      el.matches && el.matches( sel ) ||
      el.msMatchesSelector && el.msMatchesSelector( sel )
    );
  }

  /**
   * Register a click listener on document.body that opens share links in a
   * pop-up window instead of navigating.
   *
   * @param {string} selector   CSS selector identifying the share link.
   * @param {string} windowName Target name passed to window.open().
   * @param {string} features   Window features string passed to window.open().
   */
  function bindSharePopup( selector, windowName, features ) {
    document.body.addEventListener( 'click', function ( event ) {
      if ( ! event.target ) {
        return;
      }

      // Accept a click on the link itself or on a direct child of the link.
      var el;
      if ( matches( event.target, selector ) ) {
        el = event.target;
      } else if ( event.target.parentNode && matches( event.target.parentNode, selector ) ) {
        el = event.target.parentNode;
      }

      if ( ! el ) {
        return;
      }

      event.preventDefault();

      // If there's another sharing window open, close it.
      // A truthiness check is used because window.open() returns null when the
      // pop-up is blocked; a `typeof !== 'undefined'` test lets that null
      // through and the following close() call throws.
      if ( windowOpen ) {
        windowOpen.close();
      }

      windowOpen = window.open( el.getAttribute( 'href' ), windowName, features );
      return false;
    } );
  }

  // One listener per sharing service, registered in the original order.
  bindSharePopup( 'a.share-mastodon', 'wpcommastodon', 'menubar=1,resizable=1,width=460,height=400' );
  bindSharePopup( 'a.share-bluesky', 'wpcombluesky', 'menubar=1,resizable=1,width=600,height=400' );
  bindSharePopup( 'a.share-linkedin', 'wpcomlinkedin', 'menubar=1,resizable=1,width=580,height=450' );
  bindSharePopup( 'a.share-tumblr', 'wpcomtumblr', 'menubar=1,resizable=1,width=450,height=450' );
  bindSharePopup( 'a.share-pocket', 'wpcompocket', 'menubar=1,resizable=1,width=450,height=450' );
} )();
/* ]]> */
</script>
</body>
</html>