CINXE.COM

How to Build a GPT-3 for Science | Future

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"> <meta http-equiv="X-UA-Compatible" content="IE=edge"> <meta name="viewport" content="width=device-width, initial-scale=1"> <title>How to Build a GPT-3 for Science | Future</title> <link rel="icon" href="https://future.com/wp-content/themes/future/assets/images/favicon/favicon.ico"> <link rel="icon" href="https://future.com/wp-content/themes/future/assets/images/favicon/icon.svg" type="image/svg+xml"> <link rel="apple-touch-icon" href="https://future.com/wp-content/themes/future/assets/images/favicon/apple-touch-icon.png"> <link rel="manifest" href="https://future.com/wp-content/themes/future/assets/images/favicon/manifest.webmanifest"> <meta name='robots' content='index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1' /> <script type="text/javascript" id="wpp-js" src="https://future.com/wp-content/plugins/wordpress-popular-posts/assets/js/wpp.min.js?ver=7.1.0" data-sampling="0" data-sampling-rate="100" data-api-url="https://future.com/wp-json/wordpress-popular-posts" data-post-id="6859" data-token="fa137f645a" data-lang="0" data-debug="0"></script> <!-- This site is optimized with the Yoast SEO plugin v21.8 - https://yoast.com/wordpress/plugins/seo/ --> <meta name="description" content="A GPT-3-like AI model for science would accelerate innovation and improve reproducibility. Creating it will require us to unlock research." /> <link rel="canonical" href="https://future.com/how-to-build-gpt-3-for-science/" /> <meta property="og:locale" content="en_US" /> <meta property="og:type" content="article" /> <meta property="og:title" content="How to Build a GPT-3 for Science" /> <meta property="og:description" content="A GPT-3-like AI model for science would accelerate innovation and improve reproducibility. Creating it will require us to unlock research." 
/> <meta property="og:url" content="https://future.com/how-to-build-gpt-3-for-science/" /> <meta property="og:site_name" content="Future" /> <meta property="article:published_time" content="2022-08-18T15:00:00+00:00" /> <meta property="article:modified_time" content="2022-08-18T15:13:57+00:00" /> <meta property="og:image" content="https://future.com/wp-content/uploads/2022/08/812-DALL-EScienceResearch-Facebook-Final-1.jpg" /> <meta property="og:image:width" content="1200" /> <meta property="og:image:height" content="628" /> <meta property="og:image:type" content="image/jpeg" /> <meta name="author" content="Josh Nicholson" /> <meta name="twitter:card" content="summary_large_image" /> <meta name="twitter:image" content="https://future.com/wp-content/uploads/2022/08/812-DALL-EScienceResearch-Twitter-Final-1.jpg" /> <meta name="twitter:site" content="@a16z" /> <meta name="twitter:label1" content="Written by" /> <meta name="twitter:data1" content="Josh Nicholson" /> <meta name="twitter:label2" content="Est. 
reading time" /> <meta name="twitter:data2" content="10 minutes" /> <script type="application/ld+json" class="yoast-schema-graph">{"@context":"https://schema.org","@graph":[{"@type":"Article","@id":"https://future.com/how-to-build-gpt-3-for-science/#article","isPartOf":{"@id":"https://future.com/how-to-build-gpt-3-for-science/"},"author":[{"@id":"https://future.com/#/schema/person/image/2a04bf639702055a83cda91ee69e53d1"}],"headline":"How to Build a GPT-3 for Science","datePublished":"2022-08-18T15:00:00+00:00","dateModified":"2022-08-18T15:13:57+00:00","mainEntityOfPage":{"@id":"https://future.com/how-to-build-gpt-3-for-science/"},"wordCount":2216,"publisher":{"@id":"https://future.com/#organization"},"articleSection":["Bio & Science","Data"],"inLanguage":"en-US"},{"@type":"WebPage","@id":"https://future.com/how-to-build-gpt-3-for-science/","url":"https://future.com/how-to-build-gpt-3-for-science/","name":"How to Build a GPT-3 for Science | Future","isPartOf":{"@id":"https://future.com/#website"},"datePublished":"2022-08-18T15:00:00+00:00","dateModified":"2022-08-18T15:13:57+00:00","description":"A GPT-3-like AI model for science would accelerate innovation and improve reproducibility. 
Creating it will require us to unlock research.","breadcrumb":{"@id":"https://future.com/how-to-build-gpt-3-for-science/#breadcrumb"},"inLanguage":"en-US","potentialAction":[{"@type":"ReadAction","target":["https://future.com/how-to-build-gpt-3-for-science/"]}]},{"@type":"BreadcrumbList","@id":"https://future.com/how-to-build-gpt-3-for-science/#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"Home","item":"https://future.com/"},{"@type":"ListItem","position":2,"name":"How to Build a GPT-3 for Science"}]},{"@type":"WebSite","@id":"https://future.com/#website","url":"https://future.com/","name":"Future","description":"","publisher":{"@id":"https://future.com/#organization"},"potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https://future.com/?s={search_term_string}"},"query-input":"required name=search_term_string"}],"inLanguage":"en-US"},{"@type":"Organization","@id":"https://future.com/#organization","name":"Future","url":"https://future.com/","logo":{"@type":"ImageObject","inLanguage":"en-US","@id":"https://future.com/#/schema/logo/image/","url":"https://future.com/wp-content/uploads/2021/06/a16z-logo-card.png","contentUrl":"https://future.com/wp-content/uploads/2021/06/a16z-logo-card.png","width":1200,"height":628,"caption":"Future"},"image":{"@id":"https://future.com/#/schema/logo/image/"},"sameAs":["https://twitter.com/a16z","https://www.linkedin.com/company/andreessen-horowitz/mycompany/","https://www.youtube.com/channel/UC9cn0TuPq4dnbTY-CBsm8XA"]},{"@type":"Person","@id":"https://future.com/#/schema/person/image/2a04bf639702055a83cda91ee69e53d1","name":"Josh Nicholson","image":{"@type":"ImageObject","inLanguage":"en-US","@id":"https://future.com/#/schema/person/image/bab1666129c92293b30fc2617e95c064","url":"https://future.com/wp-content/uploads/2022/06/IMG_4897-150x150.png","contentUrl":"https://future.com/wp-content/uploads/2022/06/IMG_4897-150x150.png","width":150,"height":150,"caption":"Josh 
Nicholson"},"description":"is co-founder and CEO of scite. He holds a PhD in Cell Biology from Virginia Tech and has built and sold two companies aimed at improving how researchers collaborate and publish their work.","url":"https://future.com/author/josh-nicholson/"}]}</script> <!-- / Yoast SEO plugin. --> <link rel='dns-prefetch' href='//cdn.parsely.com' /> <link rel='dns-prefetch' href='//info.future.com' /> <script type="text/javascript"> /* <![CDATA[ */ window._wpemojiSettings = {"baseUrl":"https:\/\/s.w.org\/images\/core\/emoji\/15.0.3\/72x72\/","ext":".png","svgUrl":"https:\/\/s.w.org\/images\/core\/emoji\/15.0.3\/svg\/","svgExt":".svg","source":{"concatemoji":"https:\/\/future.com\/wp-includes\/js\/wp-emoji-release.min.js?ver=6.6.1"}}; /*! This file is auto-generated */ !function(i,n){var o,s,e;function c(e){try{var t={supportTests:e,timestamp:(new Date).valueOf()};sessionStorage.setItem(o,JSON.stringify(t))}catch(e){}}function p(e,t,n){e.clearRect(0,0,e.canvas.width,e.canvas.height),e.fillText(t,0,0);var t=new Uint32Array(e.getImageData(0,0,e.canvas.width,e.canvas.height).data),r=(e.clearRect(0,0,e.canvas.width,e.canvas.height),e.fillText(n,0,0),new Uint32Array(e.getImageData(0,0,e.canvas.width,e.canvas.height).data));return t.every(function(e,t){return e===r[t]})}function u(e,t,n){switch(t){case"flag":return n(e,"\ud83c\udff3\ufe0f\u200d\u26a7\ufe0f","\ud83c\udff3\ufe0f\u200b\u26a7\ufe0f")?!1:!n(e,"\ud83c\uddfa\ud83c\uddf3","\ud83c\uddfa\u200b\ud83c\uddf3")&&!n(e,"\ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc65\udb40\udc6e\udb40\udc67\udb40\udc7f","\ud83c\udff4\u200b\udb40\udc67\u200b\udb40\udc62\u200b\udb40\udc65\u200b\udb40\udc6e\u200b\udb40\udc67\u200b\udb40\udc7f");case"emoji":return!n(e,"\ud83d\udc26\u200d\u2b1b","\ud83d\udc26\u200b\u2b1b")}return!1}function f(e,t,n){var r="undefined"!=typeof WorkerGlobalScope&&self instanceof WorkerGlobalScope?new 
OffscreenCanvas(300,150):i.createElement("canvas"),a=r.getContext("2d",{willReadFrequently:!0}),o=(a.textBaseline="top",a.font="600 32px Arial",{});return e.forEach(function(e){o[e]=t(a,e,n)}),o}function t(e){var t=i.createElement("script");t.src=e,t.defer=!0,i.head.appendChild(t)}"undefined"!=typeof Promise&&(o="wpEmojiSettingsSupports",s=["flag","emoji"],n.supports={everything:!0,everythingExceptFlag:!0},e=new Promise(function(e){i.addEventListener("DOMContentLoaded",e,{once:!0})}),new Promise(function(t){var n=function(){try{var e=JSON.parse(sessionStorage.getItem(o));if("object"==typeof e&&"number"==typeof e.timestamp&&(new Date).valueOf()<e.timestamp+604800&&"object"==typeof e.supportTests)return e.supportTests}catch(e){}return null}();if(!n){if("undefined"!=typeof Worker&&"undefined"!=typeof OffscreenCanvas&&"undefined"!=typeof URL&&URL.createObjectURL&&"undefined"!=typeof Blob)try{var e="postMessage("+f.toString()+"("+[JSON.stringify(s),u.toString(),p.toString()].join(",")+"));",r=new Blob([e],{type:"text/javascript"}),a=new Worker(URL.createObjectURL(r),{name:"wpTestEmojiSupports"});return void(a.onmessage=function(e){c(n=e.data),a.terminate(),t(n)})}catch(e){}c(n=f(s,u,p))}t(n)}).then(function(e){for(var t in e)n.supports[t]=e[t],n.supports.everything=n.supports.everything&&n.supports[t],"flag"!==t&&(n.supports.everythingExceptFlag=n.supports.everythingExceptFlag&&n.supports[t]);n.supports.everythingExceptFlag=n.supports.everythingExceptFlag&&!n.supports.flag,n.DOMReady=!1,n.readyCallback=function(){n.DOMReady=!0}}).then(function(){return e}).then(function(){var e;n.supports.everything||(n.readyCallback(),(e=n.source||{}).concatemoji?t(e.concatemoji):e.wpemoji&&e.twemoji&&(t(e.twemoji),t(e.wpemoji)))}))}((window,document),window._wpemojiSettings); /* ]]> */ </script> <style id='wp-emoji-styles-inline-css' type='text/css'> img.wp-smiley, img.emoji { display: inline !important; border: none !important; box-shadow: none !important; height: 1em !important; 
width: 1em !important; margin: 0 0.07em !important; vertical-align: -0.1em !important; background: none !important; padding: 0 !important; } </style> <link rel='stylesheet' id='wp-block-library-css' href='https://future.com/wp-includes/css/dist/block-library/style.min.css?ver=6.6.1' type='text/css' media='all' /> <style id='wp-parsely-recommendations-style-inline-css' type='text/css'> .parsely-recommendations-list-title{font-size:1.2em}.parsely-recommendations-list{list-style:none;padding:unset}.parsely-recommendations-cardbody{overflow:hidden;padding:.8em;text-overflow:ellipsis;white-space:nowrap}.parsely-recommendations-cardmedia{padding:.8em .8em 0} </style> <style id='classic-theme-styles-inline-css' type='text/css'> /*! This file is auto-generated */ .wp-block-button__link{color:#fff;background-color:#32373c;border-radius:9999px;box-shadow:none;text-decoration:none;padding:calc(.667em + 2px) calc(1.333em + 2px);font-size:1.125em}.wp-block-file__button{background:#32373c;color:#fff;text-decoration:none} </style> <style id='global-styles-inline-css' type='text/css'> :root{--wp--preset--aspect-ratio--square: 1;--wp--preset--aspect-ratio--4-3: 4/3;--wp--preset--aspect-ratio--3-4: 3/4;--wp--preset--aspect-ratio--3-2: 3/2;--wp--preset--aspect-ratio--2-3: 2/3;--wp--preset--aspect-ratio--16-9: 16/9;--wp--preset--aspect-ratio--9-16: 9/16;--wp--preset--color--black: #000000;--wp--preset--color--cyan-bluish-gray: #abb8c3;--wp--preset--color--white: #ffffff;--wp--preset--color--pale-pink: #f78da7;--wp--preset--color--vivid-red: #cf2e2e;--wp--preset--color--luminous-vivid-orange: #ff6900;--wp--preset--color--luminous-vivid-amber: #fcb900;--wp--preset--color--light-green-cyan: #7bdcb5;--wp--preset--color--vivid-green-cyan: #00d084;--wp--preset--color--pale-cyan-blue: #8ed1fc;--wp--preset--color--vivid-cyan-blue: #0693e3;--wp--preset--color--vivid-purple: #9b51e0;--wp--preset--gradient--vivid-cyan-blue-to-vivid-purple: linear-gradient(135deg,rgba(6,147,227,1) 
0%,rgb(155,81,224) 100%);--wp--preset--gradient--light-green-cyan-to-vivid-green-cyan: linear-gradient(135deg,rgb(122,220,180) 0%,rgb(0,208,130) 100%);--wp--preset--gradient--luminous-vivid-amber-to-luminous-vivid-orange: linear-gradient(135deg,rgba(252,185,0,1) 0%,rgba(255,105,0,1) 100%);--wp--preset--gradient--luminous-vivid-orange-to-vivid-red: linear-gradient(135deg,rgba(255,105,0,1) 0%,rgb(207,46,46) 100%);--wp--preset--gradient--very-light-gray-to-cyan-bluish-gray: linear-gradient(135deg,rgb(238,238,238) 0%,rgb(169,184,195) 100%);--wp--preset--gradient--cool-to-warm-spectrum: linear-gradient(135deg,rgb(74,234,220) 0%,rgb(151,120,209) 20%,rgb(207,42,186) 40%,rgb(238,44,130) 60%,rgb(251,105,98) 80%,rgb(254,248,76) 100%);--wp--preset--gradient--blush-light-purple: linear-gradient(135deg,rgb(255,206,236) 0%,rgb(152,150,240) 100%);--wp--preset--gradient--blush-bordeaux: linear-gradient(135deg,rgb(254,205,165) 0%,rgb(254,45,45) 50%,rgb(107,0,62) 100%);--wp--preset--gradient--luminous-dusk: linear-gradient(135deg,rgb(255,203,112) 0%,rgb(199,81,192) 50%,rgb(65,88,208) 100%);--wp--preset--gradient--pale-ocean: linear-gradient(135deg,rgb(255,245,203) 0%,rgb(182,227,212) 50%,rgb(51,167,181) 100%);--wp--preset--gradient--electric-grass: linear-gradient(135deg,rgb(202,248,128) 0%,rgb(113,206,126) 100%);--wp--preset--gradient--midnight: linear-gradient(135deg,rgb(2,3,129) 0%,rgb(40,116,252) 100%);--wp--preset--font-size--small: 13px;--wp--preset--font-size--medium: 20px;--wp--preset--font-size--large: 36px;--wp--preset--font-size--x-large: 42px;--wp--preset--spacing--20: 0.44rem;--wp--preset--spacing--30: 0.67rem;--wp--preset--spacing--40: 1rem;--wp--preset--spacing--50: 1.5rem;--wp--preset--spacing--60: 2.25rem;--wp--preset--spacing--70: 3.38rem;--wp--preset--spacing--80: 5.06rem;--wp--preset--shadow--natural: 6px 6px 9px rgba(0, 0, 0, 0.2);--wp--preset--shadow--deep: 12px 12px 50px rgba(0, 0, 0, 0.4);--wp--preset--shadow--sharp: 6px 6px 0px rgba(0, 0, 0, 
0.2);--wp--preset--shadow--outlined: 6px 6px 0px -3px rgba(255, 255, 255, 1), 6px 6px rgba(0, 0, 0, 1);--wp--preset--shadow--crisp: 6px 6px 0px rgba(0, 0, 0, 1);}:where(.is-layout-flex){gap: 0.5em;}:where(.is-layout-grid){gap: 0.5em;}body .is-layout-flex{display: flex;}.is-layout-flex{flex-wrap: wrap;align-items: center;}.is-layout-flex > :is(*, div){margin: 0;}body .is-layout-grid{display: grid;}.is-layout-grid > :is(*, div){margin: 0;}:where(.wp-block-columns.is-layout-flex){gap: 2em;}:where(.wp-block-columns.is-layout-grid){gap: 2em;}:where(.wp-block-post-template.is-layout-flex){gap: 1.25em;}:where(.wp-block-post-template.is-layout-grid){gap: 1.25em;}.has-black-color{color: var(--wp--preset--color--black) !important;}.has-cyan-bluish-gray-color{color: var(--wp--preset--color--cyan-bluish-gray) !important;}.has-white-color{color: var(--wp--preset--color--white) !important;}.has-pale-pink-color{color: var(--wp--preset--color--pale-pink) !important;}.has-vivid-red-color{color: var(--wp--preset--color--vivid-red) !important;}.has-luminous-vivid-orange-color{color: var(--wp--preset--color--luminous-vivid-orange) !important;}.has-luminous-vivid-amber-color{color: var(--wp--preset--color--luminous-vivid-amber) !important;}.has-light-green-cyan-color{color: var(--wp--preset--color--light-green-cyan) !important;}.has-vivid-green-cyan-color{color: var(--wp--preset--color--vivid-green-cyan) !important;}.has-pale-cyan-blue-color{color: var(--wp--preset--color--pale-cyan-blue) !important;}.has-vivid-cyan-blue-color{color: var(--wp--preset--color--vivid-cyan-blue) !important;}.has-vivid-purple-color{color: var(--wp--preset--color--vivid-purple) !important;}.has-black-background-color{background-color: var(--wp--preset--color--black) !important;}.has-cyan-bluish-gray-background-color{background-color: var(--wp--preset--color--cyan-bluish-gray) !important;}.has-white-background-color{background-color: var(--wp--preset--color--white) 
!important;}.has-pale-pink-background-color{background-color: var(--wp--preset--color--pale-pink) !important;}.has-vivid-red-background-color{background-color: var(--wp--preset--color--vivid-red) !important;}.has-luminous-vivid-orange-background-color{background-color: var(--wp--preset--color--luminous-vivid-orange) !important;}.has-luminous-vivid-amber-background-color{background-color: var(--wp--preset--color--luminous-vivid-amber) !important;}.has-light-green-cyan-background-color{background-color: var(--wp--preset--color--light-green-cyan) !important;}.has-vivid-green-cyan-background-color{background-color: var(--wp--preset--color--vivid-green-cyan) !important;}.has-pale-cyan-blue-background-color{background-color: var(--wp--preset--color--pale-cyan-blue) !important;}.has-vivid-cyan-blue-background-color{background-color: var(--wp--preset--color--vivid-cyan-blue) !important;}.has-vivid-purple-background-color{background-color: var(--wp--preset--color--vivid-purple) !important;}.has-black-border-color{border-color: var(--wp--preset--color--black) !important;}.has-cyan-bluish-gray-border-color{border-color: var(--wp--preset--color--cyan-bluish-gray) !important;}.has-white-border-color{border-color: var(--wp--preset--color--white) !important;}.has-pale-pink-border-color{border-color: var(--wp--preset--color--pale-pink) !important;}.has-vivid-red-border-color{border-color: var(--wp--preset--color--vivid-red) !important;}.has-luminous-vivid-orange-border-color{border-color: var(--wp--preset--color--luminous-vivid-orange) !important;}.has-luminous-vivid-amber-border-color{border-color: var(--wp--preset--color--luminous-vivid-amber) !important;}.has-light-green-cyan-border-color{border-color: var(--wp--preset--color--light-green-cyan) !important;}.has-vivid-green-cyan-border-color{border-color: var(--wp--preset--color--vivid-green-cyan) !important;}.has-pale-cyan-blue-border-color{border-color: var(--wp--preset--color--pale-cyan-blue) 
!important;}.has-vivid-cyan-blue-border-color{border-color: var(--wp--preset--color--vivid-cyan-blue) !important;}.has-vivid-purple-border-color{border-color: var(--wp--preset--color--vivid-purple) !important;}.has-vivid-cyan-blue-to-vivid-purple-gradient-background{background: var(--wp--preset--gradient--vivid-cyan-blue-to-vivid-purple) !important;}.has-light-green-cyan-to-vivid-green-cyan-gradient-background{background: var(--wp--preset--gradient--light-green-cyan-to-vivid-green-cyan) !important;}.has-luminous-vivid-amber-to-luminous-vivid-orange-gradient-background{background: var(--wp--preset--gradient--luminous-vivid-amber-to-luminous-vivid-orange) !important;}.has-luminous-vivid-orange-to-vivid-red-gradient-background{background: var(--wp--preset--gradient--luminous-vivid-orange-to-vivid-red) !important;}.has-very-light-gray-to-cyan-bluish-gray-gradient-background{background: var(--wp--preset--gradient--very-light-gray-to-cyan-bluish-gray) !important;}.has-cool-to-warm-spectrum-gradient-background{background: var(--wp--preset--gradient--cool-to-warm-spectrum) !important;}.has-blush-light-purple-gradient-background{background: var(--wp--preset--gradient--blush-light-purple) !important;}.has-blush-bordeaux-gradient-background{background: var(--wp--preset--gradient--blush-bordeaux) !important;}.has-luminous-dusk-gradient-background{background: var(--wp--preset--gradient--luminous-dusk) !important;}.has-pale-ocean-gradient-background{background: var(--wp--preset--gradient--pale-ocean) !important;}.has-electric-grass-gradient-background{background: var(--wp--preset--gradient--electric-grass) !important;}.has-midnight-gradient-background{background: var(--wp--preset--gradient--midnight) !important;}.has-small-font-size{font-size: var(--wp--preset--font-size--small) !important;}.has-medium-font-size{font-size: var(--wp--preset--font-size--medium) !important;}.has-large-font-size{font-size: var(--wp--preset--font-size--large) 
!important;}.has-x-large-font-size{font-size: var(--wp--preset--font-size--x-large) !important;} :where(.wp-block-post-template.is-layout-flex){gap: 1.25em;}:where(.wp-block-post-template.is-layout-grid){gap: 1.25em;} :where(.wp-block-columns.is-layout-flex){gap: 2em;}:where(.wp-block-columns.is-layout-grid){gap: 2em;} :root :where(.wp-block-pullquote){font-size: 1.5em;line-height: 1.6;} </style> <link rel='stylesheet' id='bcct_style-css' href='https://future.com/wp-content/plugins/better-click-to-tweet/assets/css/styles.css?ver=3.0' type='text/css' media='all' /> <link rel='stylesheet' id='wordpress-popular-posts-css-css' href='https://future.com/wp-content/plugins/wordpress-popular-posts/assets/css/wpp.css?ver=7.1.0' type='text/css' media='all' /> <link rel='stylesheet' id='base-theme-style-css' href='https://future.com/wp-content/themes/future/assets/compiled/css/theme.css?id=fa9fd051ff43aebff1c3' type='text/css' media='all' /> <script type="text/javascript" src="//info.future.com/js/forms2/js/forms2.min.js?ver=1.0" id="marketo-newsletter-js"></script> <script type="text/javascript" src="https://future.com/wp-includes/js/jquery/jquery.min.js?ver=3.7.1" id="jquery-core-js"></script> <script type="text/javascript" src="https://future.com/wp-includes/js/jquery/jquery-migrate.min.js?ver=3.4.1" id="jquery-migrate-js"></script> <link rel="https://api.w.org/" href="https://future.com/wp-json/" /><link rel="alternate" title="JSON" type="application/json" href="https://future.com/wp-json/wp/v2/posts/6859" /><link rel='shortlink' href='https://future.com/?p=6859' /> <link rel="alternate" title="oEmbed (JSON)" type="application/json+oembed" href="https://future.com/wp-json/oembed/1.0/embed?url=https%3A%2F%2Ffuture.com%2Fhow-to-build-gpt-3-for-science%2F" /> <link rel="alternate" title="oEmbed (XML)" type="text/xml+oembed" href="https://future.com/wp-json/oembed/1.0/embed?url=https%3A%2F%2Ffuture.com%2Fhow-to-build-gpt-3-for-science%2F&#038;format=xml" /> <style 
id="wpp-loading-animation-styles">@-webkit-keyframes bgslide{from{background-position-x:0}to{background-position-x:-200%}}@keyframes bgslide{from{background-position-x:0}to{background-position-x:-200%}}.wpp-widget-block-placeholder,.wpp-shortcode-placeholder{margin:0 auto;width:60px;height:3px;background:#dd3737;background:linear-gradient(90deg,#dd3737 0%,#571313 10%,#dd3737 100%);background-size:200% auto;border-radius:3px;-webkit-animation:bgslide 1s infinite linear;animation:bgslide 1s infinite linear}</style> <script type="application/ld+json">{"@context":"https:\/\/schema.org","@type":"NewsArticle","headline":"How to Build a GPT-3 for Science","url":"https:\/\/future.com\/how-to-build-gpt-3-for-science\/","mainEntityOfPage":{"@type":"WebPage","@id":"https:\/\/future.com\/how-to-build-gpt-3-for-science\/"},"thumbnailUrl":"","image":{"@type":"ImageObject","url":""},"articleSection":"Bio & Science","author":[{"@type":"Person","name":"Josh Nicholson"}],"creator":["Josh Nicholson"],"publisher":{"@type":"Organization","name":"Future","logo":""},"keywords":[],"dateCreated":"2022-08-18T15:00:00Z","datePublished":"2022-08-18T15:00:00Z","dateModified":"2022-08-18T15:13:57Z"}</script><script src="https://cdn.optimizely.com/js/21286310933.js"></script> <style id="hidepage"> body{display:none !important;} </style> <script type="text/javascript"> if (self === top) { var hidepage = document.getElementById("hidepage"); hidepage.parentNode.removeChild(hidepage); } else { top.location = self.location; } </script> <script async src="https://www.googletagmanager.com/gtag/js?id=G-P4KXVVJ86J"></script> <script> window.dataLayer = window.dataLayer || []; function gtag(){dataLayer.push(arguments);} gtag('js', new Date()); gtag('config', 'G-P4KXVVJ86J'); </script> <script> window.search_page = 'https://future.com/search/'; window.search_index = 'future_posts_prod'; window.search_index_date = 'future_posts_prod_by_date'; </script> <!-- Google Tag Manager --> 
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start': new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0], j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= 'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f); })(window,document,'script','dataLayer','GTM-M59QZLF');</script> <!-- End Google Tag Manager --> <script> //MARKETO (function() { var didInit = false; function initMunchkin() { if(didInit === false) { didInit = true; Munchkin.init('382-JZB-798'); } } var s = document.createElement('script'); s.type = 'text/javascript'; s.async = true; s.src = '//munchkin.marketo.net/munchkin.js'; s.onreadystatechange = function() { if (this.readyState == 'complete' || this.readyState == 'loaded') { initMunchkin(); } }; s.onload = initMunchkin; document.getElementsByTagName('head')[0].appendChild(s); })(); </script> <script id="featured-posts"> window.featured_posts_list = [{"url":"https:\/\/future.com\/what-the-merge-means-for-ethereum-with-danny-ryan\/","is_post":false,"type":"Article","title":"What the Merge Means for Ethereum","author":"Danny Ryan and Jeff Benson","desc":"The Ethereum Foundation's Danny Ryan discusses how the Merge will increase security and explains how proof of stake impacts developers."},{"url":"https:\/\/future.com\/jason-fried-basecamp-hey-interview\/","is_post":false,"type":"Article","title":"Jason Fried on Why He Doesn&#8217;t Do Planning or Politics at Work","author":"Jason Fried and Lauren Murrow","desc":"The Basecamp and HEY founder discusses the power of short-term thinking, his framework for startup longevity, and the key thing he looks for when hiring remote."},{"url":"https:\/\/future.com\/how-to-build-gpt-3-for-science\/","is_post":false,"type":"Article","title":"How to Build a GPT-3 for Science","author":"Josh Nicholson","desc":"A GPT-3-like AI model for science would accelerate innovation and improve reproducibility. 
Creating it will require us to unlock scientific publications."},{"url":"https:\/\/future.com\/developers-side-projects\/","is_post":false,"type":"Article","title":"Why Developers Are Building So Many Side Projects","author":"Ben Stokes","desc":"From unleashing creativity to mitigating risk, Ben Stokes of Tiny Projects shares some of the main reasons why developers are building so many side projects."},{"url":"https:\/\/future.com\/how-the-merge-will-impact-future-ethereum-upgrades\/","is_post":false,"type":"Article","title":"What&#8217;s After Proof of Stake for Ethereum?","author":"Danny Ryan and Jeff Benson","desc":"Danny Ryan details the path forward for scalability and explains the possibility of stateless Ethereum, while sharing how the Merge will affect future upgrades."}]; </script> </head> <body class="is-article-page"> <div id="wrapper"> <div class="w1"> <header id="header"> <div class="container"> <a href="/" class="logo"> <img src="https://future.com/wp-content/themes/future/assets/images/future.svg" alt="Future Logo"> </a> <div class="search-wrap"> <a href="#" class="icon-search search-opener"></a> </div> <a href="#" class="nav-opener"><span></span></a> <div class="header-content"> <nav id="nav"> <ul> <li > <a href="https://future.com/topics/">Tech Trends</a> <div class="drop"> <div class="inner"> <div class="col"> <ul class="list"> <li> <a href="https://future.com/category/crypto-web3/">Crypto & Web3</a> </li> <li> <a href="https://future.com/category/marketplaces/">Marketplaces</a> </li> <li> <a href="https://future.com/category/creator-economy/">Creator Economy</a> </li> <li> <a href="https://future.com/category/saas/">SaaS</a> </li> <li> <a href="https://future.com/category/fintech/">Fintech</a> </li> <li> <a href="https://future.com/category/infrastructure/">Infrastructure</a> </li> <li> <a href="https://future.com/category/social/">Social</a> </li> <li> <a href="https://future.com/category/games/">Games</a> </li> <li> <a 
href="https://future.com/category/bio-science/">Bio & Science</a> </li> <li> <a href="https://future.com/category/data/">Data</a> </li> <li class="see-wrap"><a href="https://future.com/topics/">See all Tech Trends</a></li> </ul> </div> <div class="link-col"> <div class="link-wrap"> <a class="see" href="https://future.com/topics/">See all Tech Trends</a> </div> </div> </div> </div> </li> <li > <a href="https://future.com/topics/">Company Building</a> <div class="drop"> <div class="inner"> <div class="col"> <ul class="list"> <li> <a href="https://future.com/category/capital-fundraising">Capital & Fundraising</a> </li> <li> <a href="https://future.com/category/product-design-engineering/">Product, Design & Engineering</a> </li> <li> <a href="https://future.com/category/metrics-kpis/">Metrics & KPIs</a> </li> <li> <a href="https://future.com/category/sales-go-to-market/">Sales & Go To Market</a> </li> <li> <a href="https://future.com/category/growth-marketing/">Growth & Marketing</a> </li> <li> <a href="https://future.com/category/innovation-creativity">Innovation & Creativity</a> </li> <li> <a href="https://future.com/category/leadership-and-management/">Leadership & Management</a> </li> <li> <a href="https://future.com/category/work-life/">Work Life</a> </li> <li class="see-wrap"><a href="https://future.com/topics/">See all Company Building</a></li> </ul> </div> <div class="link-col"> <div class="link-wrap"> <a class="see" href="https://future.com/topics/">See all Company Building</a> </div> </div> </div> </div> </li> <li class="has-full-drop"> <a href="#">Special Reports</a> <div class="drop"> <div class="inner"> <div class="row pkg-list"> <div class="col-xs-6 col-sm-6 col-md-3"> <a href="https://future.com/marketplace-100/"> <div class="img-wrap"> <img src="https://future.com/wp-content/uploads/2022/04/M100-Social-Share-Facebook.jpg" class="attachment-full" alt="M100 Social Share Facebook" /> </div> <strong>Marketplace 100</strong> <span></span> </a> </div> <div 
class="col-xs-6 col-sm-6 col-md-3"> <a href="https://future.com/data50/"> <div class="img-wrap"> <img src="https://future.com/wp-content/uploads/2022/03/Data50-FutureHomepage-ReportPackage-Graphic.png" class="attachment-full" alt="Data50-FutureHomepage-ReportPackage-Graphic" /> </div> <strong>Data50</strong> <span></span> </a> </div> </div> </div> </div> </li> <li > <a href="https://future.com/podcast-network/">Podcasts</a> </li> </ul> </nav> <div class="header-right"> <!-- <a href="https://future.com/podcast-network/">Podcasts</a> --> <div class="search"> <a href="#" class="icon-search search-opener"></a> </div> </div> </div> </div> <div class="search-content"> <div class="search-row"> <div class="container"> <div class="search-holder"> <label for="search"></label> <div id="search-quick-field"></div> <span class="search-bg"></span> <a href="#" class="close search-opener" id="btn-search-opener"></a> </div> </div> </div> <div class="search-result-block"> <div class="container"> <div class="search-result-row"> <div class="search-suggestion-list"> <strong class="title">Top Suggestions</strong> <ul> <li> <a href="#" data-search-suggestion-quick="future 2024-2028 presidents timelines">future 2024-2028 presidents timelines</a> </li> <li> <a href="#" data-search-suggestion-quick="info diet">info diet</a> </li> <li> <a href="#" data-search-suggestion-quick="modern data stack">modern data stack</a> </li> <li> <a href="#" data-search-suggestion-quick="future investment">future investment</a> </li> <li> <a href="#" data-search-suggestion-quick="list">list</a> </li> <li> <a href="#" data-search-suggestion-quick="ivankiak87@gmail.com ivan kiiak 1008931239949 41070917554 2000251962 4337111122221224 4337111122221235 380660917554"><span class="__cf_email__" data-cfemail="4d243b2c2326242c26757a0d2a202c2421632e2220">[email&#160;protected]</span> ivan kiiak 1008931239949 41070917554 2000251962 4337111122221224 4337111122221235 380660917554</a> </li> <li> <a href="#" 
data-search-suggestion-quick="amazon">amazon</a> </li> <li> <a href="#" data-search-suggestion-quick="future">future</a> </li> <li> <a href="#" data-search-suggestion-quick="contact">contact</a> </li> <li> <a href="#" data-search-suggestion-quick="amlwatcher">amlwatcher</a> </li> </ul> </div> <div class="search-result-list"> <div id="search-quick-results-suggested"> <div> <div class="ais-Hits"> <ol class="ais-Hits-list"> <li class="ais-Hits-item"> <a href="https://future.com/north-star-metrics/"> <strong class="title"> Choosing Your North Star Metric </strong> <p>Lenny Rachitsky</p> </a> </li> <li class="ais-Hits-item"> <a href="https://future.com/community-%e2%89%a0-marketing-why-we-need-go-to-community-not-just-go-to-market/"> <strong class="title"> Community ≠ Marketing: Why We Need Go-to-Community, Not Just Go-to-Market </strong> <p>Patrick Woods</p> </a> </li> <li class="ais-Hits-item"> <a href="https://future.com/how-to-build-gpt-3-for-science/"> <strong class="title"> How to Build a GPT-3 for Science </strong> <p>Josh Nicholson</p> </a> </li> <li class="ais-Hits-item"> <a href="https://future.com/product-thinking/"> <strong class="title"> The Power of Product Thinking </strong> <p>Julie Zhuo</p> </a> </li> <li class="ais-Hits-item"> <a href="https://future.com/college-ambassador-program-how-to-for-startups/"> <strong class="title"> A Startup's Guide to Launching College Ambassador Programs </strong> <p>Jacob Westphal</p> </a> </li> </ol> </div> </div> </div> <div id="search-quick-results" class="hide-results"></div> <a id="btn-advanced-search" data-advanced-search="https://future.com/search/" href="https://future.com/search/" class="see-adv">See More Results</a> </div> </div> </div> </div> </div> </header> <div class="main-article"> <div class="container"> <div class="row"> <div class="col-xs-12 col-md-7 col-md-offset-3 col-lg-offset-2"> <div class="article-heading"> <strong class="topic-title"><a href="https://future.com/category/bio-science/">Bio &amp; 
Science</a></strong> <h1>How to Build a GPT-3 for Science</h1> <span class="sub-title"> <a href="https://future.com/author/josh-nicholson/"> Josh Nicholson </a> </span> <div class="share-holder"> <div class="social-networks js-dropdown"> <a href="#" class="share-opener">share</a> <ul> <li><a href="https://future.com/how-to-build-gpt-3-for-science/" data-share-title="How to Build a GPT-3 for Science" data-share="twitter"><i class="icon-twitter"></i>Twitter</a></li> <li><a href="https://future.com/how-to-build-gpt-3-for-science/" data-share-title="How to Build a GPT-3 for Science" data-share="linkedin"><i class="icon-linkedin"></i>LinkedIn</a></li> <li><a href="https://future.com/how-to-build-gpt-3-for-science/" data-share-title="How to Build a GPT-3 for Science" data-share="facebook"><i class="icon-facebook"></i>Facebook</a></li> <li><a href="https://future.com/how-to-build-gpt-3-for-science/" data-share-title="How to Build a GPT-3 for Science" data-share="hackernews"><i class="icon-hacker-news"></i>Hacker News</a></li> <li><a href="https://future.com/how-to-build-gpt-3-for-science/" data-share-title="How to Build a GPT-3 for Science" data-share="whatsapp"><i class="icon-whatsapp"></i>WhatsApp</a></li> <li><a href="https://future.com/how-to-build-gpt-3-for-science/" data-share-title="How to Build a GPT-3 for Science" data-share="flipboard"><i class="icon-flipboard"></i>Flipboard</a></li> <li><a href="https://future.com/how-to-build-gpt-3-for-science/" data-share-title="How to Build a GPT-3 for Science" data-share="reddit"><i class="icon-reddit"></i>Reddit</a></li> <li><a href="https://future.com/how-to-build-gpt-3-for-science/" data-share-link><i class="icon-link"></i>Copy Link</a></li> <li><a href="/cdn-cgi/l/email-protection#6a55191f08000f091e5722051d4a1e054a281f03060e4a0b4a2d3a3e47594a0c05184a3909030f04090f4c0b071a5108050e1357021e1e1a195045450c1f1e1f180f440905074502051d471e0547081f03060e470d1a1e4759470c0518471909030f04090f45"><i class="icon-mailto"></i>Send 
Email</a></li> </ul> </div> </div> </div> </div> </div> <div class="row articles-container"> <div class="col-xs-12 col-md-3 col-lg-2"> <div class="sticky-nav" id="toc-menu" style="display: none;"> <div class="top"> <a href="#" class="title"> <span class="text-hold"> Contents <span class="mark">:</span> <span class="currect-item toc-list--mobile">The Evolution of The Passion Economy</span> </span> </a> <ul class="toc-list--desktop"></ul> </div> </div> </div> <div class="col-xs-12 col-md-6 col-lg-7"> <article class="single-post"> <p><span style="font-weight: 400;">Want to create an image of </span><a href="https://twitter.com/Dalle2Pics/status/1540163384947113984?s=20&amp;t=iKCtfxqQGtt62mqvxd9Fww"><span style="font-weight: 400;">velociraptors working on a skyscraper, in the style of “Lunch Atop A Skyscraper” of 1932</span></a><span style="font-weight: 400;">? Use DALL-E. Want to create an imaginary </span><a href="https://arr.am/2020/07/22/why-gpt-3-is-good-for-comedy-or-reddit-eats-larry-page-alive/"><span style="font-weight: 400;">standup comedy show by Peter Thiel, Elon Musk, and Larry Page</span></a><span style="font-weight: 400;">? Use GPT-3. Want to deeply understand COVID-19 research and answer your questions based on evidence? Learn how to do a Boolean search, read scientific papers, and maybe get a PhD, because there are no generative AI models trained on the vast body of scientific research publications. If there were, getting evidence-backed, plain-language answers to scientific questions would be among the simplest benefits. 
Generative AI for science could help reverse the </span><a href="https://mattsclancy.substack.com/p/science-is-getting-harder"><span style="font-weight: 400;">deceleration of innovation in science</span></a><span style="font-weight: 400;"> by </span><span style="font-weight: 400;">making it </span><a href="https://www.aeaweb.org/articles?id=10.1257/aer.20180338"><span style="font-weight: 400;">easier </span></a><span style="font-weight: 400;">and </span><a href="https://cepr.org/voxeu/columns/ideas-arent-running-out-they-are-getting-more-expensive-find"><span style="font-weight: 400;">cheaper</span></a><span style="font-weight: 400;"> to find new ideas. Such models could also provide data-backed warnings of therapeutic hypotheses that are certain to fail, counterbalancing human bias and avoiding billion-dollar, </span><a href="https://www.statnews.com/2019/06/25/alzheimers-cabal-thwarted-progress-toward-cure/"><span style="font-weight: 400;">decades-long blind alleys</span></a><span style="font-weight: 400;">. Finally, such models could combat </span><a href="https://www.nature.com/articles/533452a"><span style="font-weight: 400;">the reproducibility crisis</span></a><span style="font-weight: 400;"> by mapping, weighing, and contextualizing research results, providing a score on trustability.</span></p> <p><span style="font-weight: 400;">So why don&#8217;t we have a DALL-E or GPT-3 for science? 
The reason is that although scientific research is the </span><span style="font-weight: 400;">world’s most valuable content, it is also the world&#8217;s least accessible and understandable content.</span><span style="font-weight: 400;"> I’ll explain what it would take to unlock scientific data at scale to make generative AI for science possible, and how it would transform the way we engage with research.</span><span style="font-weight: 400;">  </span></p> <h2><b></b><b><div id="section--1" data-toc-header="What makes scientific research data challenging"></div>What makes scientific research data challenging</b></h2> <p><span style="font-weight: 400;">Research publications are some of the world&#8217;s most important repositories for content and information ever created. They tie ideas and findings together across time and disciplines, and are forever preserved by a network of libraries. They are supported by evidence, analysis, expert insight, and statistical relationships. They are extremely valuable, yet they are largely hidden from the web and used very inefficiently. The web is rife with cute, cuddly cat videos but largely devoid of cutting-edge cancer research. As an example, the </span><a href="https://clarivate.com/webofsciencegroup/solutions/web-of-science/"><span style="font-weight: 400;">Web of Science</span></a><span style="font-weight: 400;"> is one of the most comprehensive indexes of scientific knowledge. It has been around for decades, but it’s probably something most readers have never even heard of, let alone interacted with. Most of us don’t have access to research papers, and even when we do, they’re dense, hard to understand, and packaged as a PDF — a format designed for printing, not for the web. 
</span><span style="font-weight: 400;"><br /> </span><span style="font-weight: 400;"><br /> </span><span style="font-weight: 400;">Because scientific papers are not easily accessible, we can’t easily use the data to train generative models like GPT-3 or DALL-E.</span> <span style="font-weight: 400;">Can you</span> <span style="font-weight: 400;">imagine if a researcher could propose an experiment and an AI model could instantly tell them if it had been done before (and better yet, give them the result)? Then, once they have data from a novel experiment, the AI could suggest a follow-up experiment based on the result. Finally, imagine the time that could be saved if the researcher could upload their results and the AI model could write the resulting manuscript for </span><span style="font-weight: 400;">them. The closest we&#8217;ve ever come to a DALL-E of science is Google Scholar, but it’s not a sustainable or scalable solution. IBM Watson also set out to achieve much of what I describe here, but most of the work came ahead of recent advances in large language models and didn’t utilize appropriate or sufficient data to match the marketing hype.</span></p> <p><span style="font-weight: 400;">For the kind of value unlock I’m describing, we need long-term investment, commitment, and vision. As proposed </span><a href="https://future.com/publomics-replication-crisis/"><span style="font-weight: 400;">recently</span></a><span style="font-weight: 400;"> in </span><i><span style="font-weight: 400;">Future</span></i><span style="font-weight: 400;">, we need to treat scientific publications as substrates to be combined and analyzed at scale. Once we remove the barriers, we will be able to use science to feed data-hungry generative AI models. 
These models have immense potential to accelerate science and increase scientific literacy, such as by training them to generate new scientific ideas, help scientists manage and navigate the vast scientific literature, identify flawed or even falsified research, and synthesize and translate complex research findings into ordinary human speech.</span></p> <h2><b><div id="section--2" data-toc-header="How do we get a DALL-E or GPT-3 for science?"></div>How do we get a DALL-E or GPT-3 for science?</b></h2> <p><span style="font-weight: 400;">If you’re in tech, showing a friend outputs from generative AI models like </span><a href="https://openai.com/blog/dall-e/"><span style="font-weight: 400;">DALL-E</span></a><span style="font-weight: 400;"> or </span><a href="https://openai.com/api/"><span style="font-weight: 400;">GPT-3</span></a><span style="font-weight: 400;"> is like showing them magic. These tools represent the next generation of the web. They derive from the synthesis of massive amounts of information, beyond a simple linkage, to create tools with generative capacity. So how can we create a similarly magical experience in science, where anyone can ask a question of the scientific literature in plain language and get an understandable answer backed by evidence? How can we help researchers create, develop, refine, and test their hypotheses? How can we potentially avoid wasting billions of dollars on </span><a href="https://www.statnews.com/2019/06/25/alzheimers-cabal-thwarted-progress-toward-cure/"><span style="font-weight: 400;">failing hypotheses in Alzheimer&#8217;s research</span></a><span style="font-weight: 400;"> and </span><a href="https://www.theatlantic.com/science/archive/2019/05/waste-1000-studies/589684/"><span style="font-weight: 400;">erroneous connections between genetics and depression</span></a><span style="font-weight: 400;">? 
</span></p> <p><span style="font-weight: 400;">The solutions to these questions might sound like science fiction, but there is proof that we can do amazing and unthinkable things when scientific work is used for more than just the sum of its parts. Indeed, utilizing nearly 200,000 protein structures</span><span style="font-weight: 400;"> in the </span><a href="https://academic.oup.com/nar/article/47/D1/D520/5144142?login=true"><span style="font-weight: 400;">Protein Data Bank</span></a><span style="font-weight: 400;"> has given </span><a href="https://www.nature.com/articles/s41586-021-03819-2"><span style="font-weight: 400;">AlphaFold</span></a><span style="font-weight: 400;"> the ability </span><span style="font-weight: 400;">to accurately predict protein structures, something that was just done for </span><a href="https://www.deepmind.com/blog/alphafold-reveals-the-structure-of-the-protein-universe"><span style="font-weight: 400;">every protein ever documented</span></a><span style="font-weight: 400;"> (over 200 million!).</span><span style="font-weight: 400;"> Leveraging research papers in a manner similar to protein structures would be a natural next step. </span></p> <h3><span style="font-weight: 400;">Decompose papers into their minimal components</span></h3> <p><span style="font-weight: 400;">Research papers are full of valuable information, including figures, charts, statistical relationships, and references to other papers. Breaking them down into various components and using them at scale could help us train machines for different types of science-related jobs, prompts or queries. Simple questions might be answered with training on one component type, but more complex questions or prompts would require incorporation of multiple component types, and an understanding of their relation to each other.  
</span></p> <p><span style="font-weight: 400;">Some examples of complex potential prompts are:</span></p> <p><span style="font-weight: 400;">“Tell me why this hypothesis is wrong”<br /> “Tell me why my treatment idea won&#8217;t work”<br /> “Generate a new treatment idea”<br /> “What evidence is there to support social policy X?”<br /> “Who has published the most reliable research in this field?”<br /> “Write me a scientific paper based on my data”<br /> </span></p> <p><span style="font-weight: 400;">Some groups are making headway on this vision. For example, </span><a href="https://elicit.org/"><span style="font-weight: 400;">Elicit</span></a><span style="font-weight: 400;"> applies GPT-3 to millions of paper titles and abstracts to help answer researchers’ questions — kind of like Alexa, but for science. </span><a href="https://www.system.com/"><span style="font-weight: 400;">System</span></a><span style="font-weight: 400;"> extracts statistical relations between entities showing how different concepts and entities are linked. </span><a href="https://primer.ai/"><span style="font-weight: 400;">Primer</span></a><span style="font-weight: 400;"> doesn’t focus on research papers per se, but it does work with arXiv and provides a dashboard of information used by corporations and governments to synthesize and understand large amounts of data from many sources. </span></p> <h3><span style="font-weight: 400;">Access all the components</span></h3> <p><span style="font-weight: 400;">Unfortunately, these groups primarily rely upon titles and abstracts only, not the full texts, since roughly five out of six articles are not freely or easily accessible. For the groups like Web of Science and Google that have the data or the papers, their licenses and scope of use are </span><a href="https://scholar.google.com/intl/en/scholar/publishers.html#policies"><span style="font-weight: 400;">limited or undefined</span></a><span style="font-weight: 400;">. 
In the case of Google, it is unclear why there have been no publicly announced efforts to train AI models on the full-text scientific research in Google Scholar. Amazingly, this didn’t even change in the midst of the COVID-19 pandemic, which brought the world to a standstill. The Google AI team stepped up, prototyping a way for the public to ask </span><a href="https://ai.googleblog.com/2020/05/an-nlu-powered-tool-to-explore-covid-19.html"><span style="font-weight: 400;">about COVID-19</span></a><span style="font-weight: 400;">. But — and here’s the kicker — they did so using only open access papers from PubMed, not Google Scholar. </span></p> <p><span style="font-weight: 400;">The issue of getting access to papers and using them for more than just reading them one at a time is something groups have advocated for decades. I have personally worked on it for nearly a decade myself, launching an open access publishing platform called </span><a href="https://thewinnower.com/"><span style="font-weight: 400;">The Winnower</span></a><span style="font-weight: 400;"> during the last year of my PhD, and then working to build the </span><a href="https://ar5iv.labs.arxiv.org/html/1709.07020"><span style="font-weight: 400;">article of the future</span></a><span style="font-weight: 400;"> at another startup called </span><a href="https://www.authorea.com/"><span style="font-weight: 400;">Authorea</span></a><span style="font-weight: 400;">. While neither of those initiatives fully panned out the way I wanted them to, they led me to my current work at </span><a href="https://scite.ai/"><span style="font-weight: 400;">scite</span></a><span style="font-weight: 400;">, which has, at least partially, solved the access issue by working directly with publishers. 
</span></p> <h3><span style="font-weight: 400;">Connect the components and define relationships</span></h3> <p><span style="font-weight: 400;">Our aim at </span><a href="https://scite.ai/"><span style="font-weight: 400;">scite</span></a><span style="font-weight: 400;"> is to introduce the </span><a href="https://direct.mit.edu/qss/article/2/3/882/102990/scite-A-smart-citation-index-that-displays-the"><span style="font-weight: 400;">next generation of citations</span></a><span style="font-weight: 400;"> — called Smart Citations — which show how and why any article, researcher, journal, or topic has been cited and more generally discussed in the literature. By working with publishers, we extract, directly from full-text articles, the sentences in which authors cite their references. These sentences offer a qualitative insight into how papers were cited by newer work. It’s a bit like Rotten Tomatoes for research.</span><span style="font-weight: 400;"><br /> </span></p> <p><span style="font-weight: 400;">This requires access to full-text articles, and cooperation with publishers, so that we can use machine learning to extract and analyze citation statements at scale. Because there were enough Open Access articles to get started, we were able to build out the proof of concept, and then, one by one, we demonstrated to publishers the increased discoverability of articles indexed in our system and provided them with a system to </span><a href="https://www.wiley.com/network/archive/endorsingdoraforresponsibleresearchassessment"><span style="font-weight: 400;">show better metrics</span></a><span style="font-weight: 400;"> for more responsible research assessment. What we saw as expert statements, they saw as previews of their articles. Publishers have now signed on en masse and we have indexed over 1.1 billion Smart Citations from more than half of all articles published. 
</span></p> <h3><span style="font-weight: 400;">Use relational data to train AI models</span></h3> <p><span style="font-weight: 400;">The components and relations extracted from papers could be used to train new large language models for research. GPT-3, while very powerful, was not built to work on science and </span><a href="https://arxiv.org/abs/2009.03300"><span style="font-weight: 400;">does poorly at answering questions you might see on the SAT</span></a><span style="font-weight: 400;">. When GPT-2 (an earlier version of GPT-3) was </span><a href="https://aclanthology.org/2022.bigscience-1.12.pdf"><span style="font-weight: 400;">adapted by training it on millions of research papers</span></a><span style="font-weight: 400;">, it worked better than GPT-2 alone on specific knowledge tasks. This highlights that the data used to train the models is exceedingly important. </span></p> <p><span style="font-weight: 400;"> </span><span style="font-weight: 400;">Some groups have recently </span><a href="https://www.scientificamerican.com/article/we-asked-gpt-3-to-write-an-academic-paper-about-itself-then-we-tried-to-get-it-published/"><span style="font-weight: 400;">used GPT-3 to write academic papers</span></a><span style="font-weight: 400;">, and while this is impressive, the facts or arguments they might purport to show could be very wrong. If the model can’t get simple SAT-style questions right, can we trust it to write a full paper? </span><a href="https://pdos.csail.mit.edu/archive/scigen/"><span style="font-weight: 400;">SCIgen</span></a><span style="font-weight: 400;">, which predates GPT-3 by nearly 20 years, showed that generating papers that look real is relatively easy. Their system, while much simpler, generated papers that were </span><a href="https://news.mit.edu/2015/how-three-mit-students-fooled-scientific-journals-0414"><span style="font-weight: 400;">accepted into various conferences</span></a><span style="font-weight: 400;">. 
We need a model that doesn&#8217;t just look scientific but is scientific, and that requires a system to verify claims for machines and humans. Meta recently introduced a </span><a href="https://tech.fb.com/artificial-intelligence/2022/07/how-ai-could-help-make-wikipedia-entries-more-accurate/"><span style="font-weight: 400;">system for verifying Wikipedia citations</span></a><span style="font-weight: 400;">, something some publishers have vocally </span><a href="https://twitter.com/IanMulvany/status/1550392984348332033?s=20&amp;t=mPkOz6PW2DQKaEUaUdeJYg"><span style="font-weight: 400;">wished they had for scholarly publications</span></a><span style="font-weight: 400;">. </span></p> <h2><b><div id="section--3" data-toc-header="Current progress"></div>Current progress</b></h2> <p><span style="font-weight: 400;">Again, one key blocker to bringing this system to fruition is a lack of access to the papers and resources to create it. Where papers or information become available to use at scale, we do see </span><span style="font-weight: 400;">tools and new models flourish. The Google Patent team used </span><a href="https://cloud.google.com/blog/products/ai-machine-learning/how-ai-improves-patent-analysis"><span style="font-weight: 400;">100 million patents to train a system for help with patent analysis</span></a><span style="font-weight: 400;">, effectively a GooglePatentBERT. Others have introduced models like </span><a href="https://arxiv.org/abs/1901.08746"><span style="font-weight: 400;">BioBERT</span></a><span style="font-weight: 400;"> and </span><a href="https://arxiv.org/abs/1903.10676"><span style="font-weight: 400;">SciBERT</span></a><span style="font-weight: 400;">, and despite the fact that they have been trained on only about 1% of scientific texts, and only in specific subject domains, they are impressive at scholarly tasks, including our citation classification system at scite. 
</span></p> <p><span style="font-weight: 400;">More recently, a <a href="https://arxiv.org/pdf/2205.11342v1.pdf">ScholarBERT</a> model has been released, which effectively does use all of the scientific literature to train BERT. They overcome the access issue but are notably mum on how, simply emphasizing their use to be “non-consumptive.” This use case might open the doors to </span><span style="font-weight: 400;">others using articles without express permission from publishers and could be an important step in creating a DALL-E of science. Surprisingly, however, ScholarBERT did worse at various specialized knowledge tasks than smaller science language models like SciBERT. </span></p> <p><span style="font-weight: 400;">Importantly, BERT-style models are much smaller scale than the large language models like GPT-3, and they don’t allow the same kind of generic prompting and in-context learning that has powered much of the GPT-3 hype. The question remains: what if we applied the same data from ScholarBERT to train a scaled-up generative model like GPT-3? What if we could somehow show where the answers from the machine were sourced, perhaps tying them directly to the literature (like Smart Citations)?</span></p> <h2><b><div id="section--4" data-toc-header="Why now?"></div>Why now?</b></h2> <p><span style="font-weight: 400;">Fortunately, papers are becoming more open and machines are becoming more powerful. We can now begin using the data contained within papers and connected repositories to train machines to answer questions and synthesize new ideas based on research. This could be transformative for healthcare, policy, technology, and everything around us. Imagine, if we didn&#8217;t search just for document titles but specifically for answers, how that would impact research and workflows across all disciplines. 
</span></p> <p><span style="font-weight: 400;"> </span><span style="font-weight: 400;">Liberating the world’s scientific knowledge from the twin barriers of accessibility and understandability will help drive the transition from a web focused on clicks, views, likes, and attention to one focused on evidence, data, and veracity. Pharma is clearly incentivized to bring this to fruition, hence the growing number of startups identifying potential drug targets using AI — but I believe the public, governments, and anyone using Google might be willing to forgo free searches in exchange for trust and time savings. The world desperately needs such a system, and it needs it fast. </span></p> <p><span style="font-weight: 400;"><br /> </span><span style="font-weight: 400;"> </span></p> <p>&nbsp;</p> <p>&nbsp;</p> <p><span style="font-weight: 400;"> </span></p> <div class="content-area-end"></div> <div class="article-footer"> Posted <time>August 18, 2022</time> </div> <ul class="author-list"> <li> <div class="avatar"> <a href="https://future.com/author/josh-nicholson/"> <img alt='' src='https://future.com/wp-content/themes/future/assets/images/no-image-default.jpg' srcset='https://future.com/wp-content/themes/future/assets/images/no-image-default.jpg 2x' class='avatar avatar-150 photo avatar-default' height='150' width='150' decoding='async'/> </a> </div> <div class="flex"> <div class="description"> <a href="https://future.com/author/josh-nicholson/"> <p> <strong>Josh Nicholson</strong> is co-founder and CEO of scite. He holds a PhD in Cell Biology from Virginia Tech and has built and sold two companies aimed at improving how researchers collaborate and publish their work. 
</p> </a> </div> <div class="follow-link"> <strong>Follow</strong> <a href="https://twitter.com/joshmnicholson" target="_blank" rel="noreferrer noopener">Twitter</a> </div> </div> </li> </ul> <div class="related-articles"> <strong class="title">Related Articles</strong> <div class="post-row"> <a href="https://future.com/why-applying-machine-learning-to-biology-is-hard-but-worth-it/"> <h3> Why Applying Machine Learning to Biology is Hard – But Worth It</h3> <span class="author">Jimmy Lin, Nicole Neuman</span> </a> </div> <div class="post-row"> <a href="https://future.com/what-synthetic-embryos-can-do/"> <h3> What Synthetic Embryos Can and Can&#8217;t Do, Now and in the Future</h3> <span class="author">Magda Zernicka-Goetz, Nicole Neuman</span> </a> </div> <div class="post-row"> <a href="https://future.com/applications-ai-models-of-the-brain-aka-neuroai/"> <h3> AI’s Next Frontier: Brains on Demand</h3> <span class="author">Patrick Mineault</span> </a> </div> <div class="post-row"> <a href="https://future.com/first-half-2022-web3-decentralized-science-desci/"> <h3> Mid-year Recap: Web3 and Science Collide</h3> <span class="author">Future Editorial</span> </a> </div> <div class="post-row"> <a href="https://future.com/koller-insitro-drug-discovery-ai-alphafold/"> <h3> The Two Things We’ll Need for the Next AlphaFold</h3> <span class="author">Daphne Koller, Nicole Neuman</span> </a> </div> </div> <div id="popular-articles--mobile"></div> </article> </div> <div class="col-xs-12 col-md-3 sidebar-post-right"> <div class="sidebar-post-right-sticky"> <div class="inner-wrapper-sticky"> <div class="wpp-shortcode"><script data-cfasync="false" src="/cdn-cgi/scripts/5c5dd728/cloudflare-static/email-decode.min.js"></script><script 
type="application/json">{"title":"","limit":"3","offset":0,"range":"last30days","time_quantity":24,"time_unit":"hour","freshness":false,"order_by":"views","post_type":"post","pid":"","cat":"","taxonomy":"category","term_id":"","author":"","shorten_title":{"active":false,"length":0,"words":false},"post-excerpt":{"active":false,"length":0,"keep_format":false,"words":false},"thumbnail":{"active":false,"build":"manual","width":0,"height":0},"rating":false,"stats_tag":{"comment_count":false,"views":true,"author":false,"date":{"active":false,"format":"F j, Y"},"category":false,"taxonomy":{"active":false,"name":"category"}},"markup":{"custom_html":true,"wpp-start":"<ul class=\"wpp-list\">","wpp-end":"<\/ul>","title-start":"<h2>","title-end":"<\/h2>","post-html":"<li class=\"{current_class}\">{thumb} {title} <span class=\"wpp-meta post-stats\">{stats}<\/span><\/li>"},"theme":{"name":""}}</script><div class="wpp-shortcode-placeholder"></div></div> </div> </div> </div> </div> </div> </div> </div> <div id="footer"> <div class="f1"> <div class="container"> <div class="logo"> <img src="https://future.com/wp-content/themes/future/assets/images/future.svg" alt="Future Logo"> </div> <div class="footer-panel"> <ul id="menu-footer-navigation" class="footer-nav"><li id="menu-item-36" class="menu-item menu-item-type-custom menu-item-object-custom menu-item-36"><a target="_blank" rel="noopener" href="https://a16z.com/tos-privacy/">Terms of Use &#038; Privacy</a></li> <li id="menu-item-34" class="menu-item menu-item-type-custom menu-item-object-custom menu-item-34"><a target="_blank" rel="noopener" href="https://a16z.com/about/">About a16z</a></li> <li id="menu-item-1327" class="menu-item menu-item-type-custom menu-item-object-custom menu-item-1327"><a href="/feed">RSS</a></li> </ul> <span class="copyright">© 2024 Future. 
All Rights Reserved.</span> <ul class="foo-social"> <li><a href="https://twitter.com/future" class="icon-twitter"></a></li> <li><a href="https://www.linkedin.com/showcase/future-by-a16z/" class="icon-linkedin"></a></li> <li><a href="https://www.facebook.com/FutureMedia/" class="icon-facebook"></a></li> </ul> </div> </div> </div> </div> <script type="text/javascript" src="https://future.com/wp-includes/js/dist/hooks.min.js?ver=2810c76e705dd1a53b18" id="wp-hooks-js"></script> <script type="text/javascript" src="https://future.com/wp-includes/js/dist/i18n.min.js?ver=5e580eb46a90c2b997e6" id="wp-i18n-js"></script> <script type="text/javascript" id="wp-i18n-js-after"> /* <![CDATA[ */ wp.i18n.setLocaleData( { 'text direction\u0004ltr': [ 'ltr' ] } ); /* ]]> */ </script> <script type="text/javascript" src="https://future.com/wp-content/plugins/wp-parsely/build/loader.js?ver=b681bb9905652ac12735" id="wp-parsely-loader-js"></script> <script type="text/javascript" data-parsely-site="future.a16z.com" src="https://cdn.parsely.com/keys/future.a16z.com/p.js?ver=3.17.0" id="parsely-cfg"></script> <script type="text/javascript" src="https://future.com/wp-content/themes/future/assets/compiled/js/app.js?id=3e6de9f5a73ec2db613f" id="base-theme-script-js"></script> <!-- Google Tag Manager (noscript) --> <noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-M59QZLF" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript> <!-- End Google Tag Manager (noscript) --> </div> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10