CINXE.COM

Tutorial | Semantic Scholar Academic Graph API

<!DOCTYPE html><!-- Last Published: Fri Nov 15 2024 22:53:53 GMT+0000 (Coordinated Universal Time) --><html data-wf-domain="webflow.semanticscholar.org" data-wf-page="6584745360a4872a287a891b" data-wf-site="605236bb767e9a5bb229c63c" lang="en"><head><meta charset="utf-8"/><title>Tutorial | Semantic Scholar Academic Graph API</title><meta content="Quick guide to get started with Semantic Scholar Academic Graph API." name="description"/><meta content="Tutorial | Semantic Scholar Academic Graph API" property="og:title"/><meta content="Quick guide to get started with Semantic Scholar Academic Graph API." property="og:description"/><meta content="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/60a2cb67437bf2de419d5137_s2-og.png" property="og:image"/><meta content="Tutorial | Semantic Scholar Academic Graph API" property="twitter:title"/><meta content="Quick guide to get started with Semantic Scholar Academic Graph API." property="twitter:description"/><meta content="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/60a2cb67437bf2de419d5137_s2-og.png" property="twitter:image"/><meta property="og:type" content="website"/><meta content="summary_large_image" name="twitter:card"/><meta content="width=device-width, initial-scale=1" name="viewport"/><link href="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/css/semanticscholar.66787fc22.min.css" rel="stylesheet" type="text/css"/><link href="https://fonts.googleapis.com" rel="preconnect"/><link href="https://fonts.gstatic.com" rel="preconnect" crossorigin="anonymous"/><script src="https://ajax.googleapis.com/ajax/libs/webfont/1.6.26/webfont.js" type="text/javascript"></script><script type="text/javascript">WebFont.load({ google: { families: ["Lato:100,100italic,300,300italic,400,400italic,700,700italic,900,900italic","Roboto Slab:300,regular,500,700","Roboto:300,regular,500,700,900","Roboto Mono:regular","Roboto Mono:100,200,300,regular"] }});</script><script 
type="text/javascript">!function(o,c){var n=c.documentElement,t=" w-mod-";n.className+=t+"js",("ontouchstart"in o||o.DocumentTouch&&c instanceof DocumentTouch)&&(n.className+=t+"touch")}(window,document);</script><link href="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/609add8e5f5ce7570f656904_favicon.png" rel="shortcut icon" type="image/x-icon"/><link href="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/609adda9bd029148c37023a9_webclip.png" rel="apple-touch-icon"/><link href="/api/1/user/webflow.css" rel="stylesheet" type="text/css"> <!-- Heap Analytics Snippet --> <script type="text/javascript"> window.heap=window.heap||[],heap.load=function(e,t){window.heap.appid=e,window.heap.config=t=t||{};var r=t.forceSSL||"https:"===document.location.protocol,a=document.createElement("script");a.type="text/javascript",a.async=!0,a.src=(r?"https:":"http:")+"//cdn.heapanalytics.com/js/heap-"+e+".js";var n=document.getElementsByTagName("script")[0];n.parentNode.insertBefore(a,n);for(var o=function(e){return function(){heap.push([e].concat(Array.prototype.slice.call(arguments,0)))}},p=["addEventProperties","addUserProperties","clearEventProperties","identify","removeEventProperty","setEventProperties","track","unsetEventProperty"],c=0;c<p.length;c++)heap[p[c]]=o(p[c])}; heap.load("2424575119"); </script> <!--End Heap Analytics Snippet--> <style type="text/css"> /* Auth */ .site-header__navigation-auth { display: var(--has-auth--block) !important; } .site-header__navigation-not-auth { display: var(--no-auth--none) !important; } /* Dropdown Menu Top Bar */ .dropdown .dropdown__menu:before{ border-color: transparent transparent #1857B6 transparent; border-style: solid; border-width: 0 8px 8px 8px; content:""; height: 0; position: absolute; right: 12px; top: -12px; width: 0; } /* Embedded Newsletter Hubspot Form */ .newsletter .hbspt-form label{ margin: 0; } .newsletter .hbspt-form .hs-form { align-items: end; display: flex; } .newsletter .hbspt-form 
.hs-form-field { flex: 1; position: relative; } .newsletter .hbspt-form .hs-form-field .hs-input { border: 1px solid #546973; font-size: 16px; height: 36px; line-height: 36px; padding: 8px; width: 100%; } .newsletter .hbspt-form .hs-form-field .hs-input.error { border-color: #a92020; } .newsletter .hbspt-form .hs-form-field .hs-error-msgs { background: #a92020; bottom: -44px; left: 4px; list-style: none; margin: 0; padding: 6px 12px; position: absolute; } .newsletter .hbspt-form .hs-form-field .hs-error-msgs:after { border-color: transparent transparent transparent #a92020; border-style: solid; border-width: 8px 0 0 8px; content: ""; height: 0; left: 0; position: absolute; top: -8px; width: 0; } .newsletter .hbspt-form .hs-form-field .hs-error-msg { color: #fff; font-size: 14px; } .newsletter .hbspt-form .hs-submit { flex: 0 0 auto; } .newsletter .hbspt-form .hs-submit .hs-button { background: #1857B6; border: none; border-radius: 0 3px 3px 0; color: #fff; cursor: pointer; font-size: 14px; height: 36px; line-height: 36px; margin: 0; padding: 0 14px; transition: background-color 250ms cubic-bezier(.25, .46, .45, .94); } .newsletter .hbspt-form .hs-submit .hs-button:hover { background: #0f3875; } .newsletter .hbspt-form .hs_error_rollup { display: none; } .newsletter .hbspt-form .submitted-message{ border: 1px solid #1857B6; border-radius: 3px; padding: 12px; } .newsletter .hbspt-form .submitted-message p { color: #fff; margin: 0; text-align: left !important; } .newsletter-embed--accessibility .hbspt-form label{ color: #fff; font-family: "Roboto Slab", Serif; font-size: 18px; font-weight: 400; text-align: center; padding-bottom: 12px; } /* Paper Object */ .paper{ filter: drop-shadow(0 1px 2px rgba(0,0,0,.1)); } .paper:after{ background: #D9DADB; clip-path: polygon(0 0, 100% 100%, 0 100%); content: " "; height: 24px; position: absolute; right: 0; top: 0; width: 24px; } .paper__content{ clip-path: polygon(0 0, calc(100% - 24px) 0%, 100% calc(0% + 24px), 100% 100%, 0% 
100%); } /* Testimonials */ .testimonial__citation:after{ content: ""; position: absolute; top: 36px; width: 0; height: 0; border-style: solid; border-width: 8px 0 8px 8px; border-color: transparent transparent transparent #f5f6f7; left: -24px; } .testimonial__citation.testimonial__citation--alt:after{ left: auto; right: -24px; border-width: 8px 8px 8px 0; border-color: transparent #f5f6f7 transparent transparent; } @media screen and (max-width: 767px){ .testimonial__citation:after{ display: none; } } </style></head><body class="body"><header class="site-header site-header--fixed"><div class="site-header__content"><a href="https://www.semanticscholar.org" class="site-header__logo w-inline-block"><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/6053b48e21b11570b9788241_s2-logo-small.svg" loading="lazy" alt="Semantic Scholar" height="36" class="logo-small"/><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/605274dd4af9b0ca8ac84182_s2-logo.svg" loading="lazy" alt="Semantic Scholar" height="36" class="logo-large"/></a><div class="site-header__search w-form"><form id="wf-form-Search" name="wf-form-Search" data-name="Search" action="https://www.semanticscholar.org/search" method="get" class="search__form" data-wf-page-id="6584745360a4872a287a891b" data-wf-element-id="6ab03a1e-944e-9291-1968-b70fe5f1160b"><input class="search__field w-input" maxlength="256" name="q" data-name="q" placeholder="Search over 214 million papers from all fields of science" type="text" id="q"/><input type="submit" data-wait="Please wait..." class="search__submit w-button" value="Search"/></form><div class="w-form-done"><div>Thank you! Your submission has been received!</div></div><div class="w-form-fail"><div>Oops! 
Something went wrong while submitting the form.</div></div></div><div class="site-header__navigation site-header__navigation-auth"><div class="site-header__navigation-wrapper"><a data-w-id="6ab03a1e-944e-9291-1968-b70fe5f11616" href="#" class="site-header__navigation-close w-inline-block"><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/60908306c5938d99543d2b58_close.svg" loading="lazy" alt=""/></a><div data-hover="false" data-delay="0" class="site-header__navigation dropdown w-dropdown"><div class="site-header__navigation dropdown button button--secondary w-dropdown-toggle"><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/605b9e8398d437113ca1d650_icon-account.svg" loading="lazy" alt="Icon - Account" height="12" class="dropdown image"/><div class="dropdown icon w-icon-dropdown-toggle"></div><div class="dropdown dropdown__text">Account</div></div><nav class="dropdown dropdown__menu w-dropdown-list"><a href="https://www.semanticscholar.org/me/research" class="dropdown dropdown__link w-inline-block"><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/62dafc4d8c585c6c25f98ce9_menu-icon-dashboard.svg" loading="lazy" width="18" height="18" alt="" class="dropdown dropdown__image"/><div>Research Dashboard</div></a><a href="https://www.semanticscholar.org/me/recommendations" class="dropdown dropdown__link w-inline-block"><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/62dafc4d4e4b2383249840d1_menu-icon-feeds.svg" loading="lazy" width="18" height="18" alt="" class="dropdown dropdown__image"/><div>Research Feeds</div></a><a href="https://www.semanticscholar.org/me/library/all" class="dropdown dropdown__link w-inline-block"><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/62dafc4c42e988c961b5809a_menu-icon-library.svg" loading="lazy" width="18" height="18" alt="" class="dropdown dropdown__image"/><div>Library</div></a><a href="https://www.semanticscholar.org/me/account" 
class="dropdown dropdown__link dropdown__link--section w-inline-block"><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/62dafc4d3f5f3076275ec271_menu-icon-settings.svg" loading="lazy" width="18" height="18" alt="" class="dropdown dropdown__image"/><div>Settings</div></a><a href="https://www.semanticscholar.org/me/research" class="dropdown dropdown__link w-inline-block"><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/62dafc4c2ab4d9438fee8d1c_menu-icon-logout.svg" loading="lazy" width="18" height="18" alt="" class="dropdown dropdown__image"/><div>Sign Out</div></a></nav></div></div><a data-w-id="6ab03a1e-944e-9291-1968-b70fe5f11630" href="#" class="site-header__navigation-open">Menu</a></div><div class="site-header__navigation site-header__navigation-not-auth"><div class="site-header__navigation-wrapper"><a data-w-id="6ab03a1e-944e-9291-1968-b70fe5f11634" href="#" class="site-header__navigation-close w-inline-block"><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/60908306c5938d99543d2b58_close.svg" loading="lazy" alt=""/></a><a href="https://www.semanticscholar.org/sign-in" class="site-header__navigation button button--secondary w-button">Sign In</a><a href="https://www.semanticscholar.org/sign-in" class="site-header__navigation button w-button">Create Free Account</a></div><a data-w-id="6ab03a1e-944e-9291-1968-b70fe5f11662" href="#" class="site-header__navigation-open">Menu</a></div></div></header><main class="main"><div class="section-navigation"><div class="section-navigation__container"><div class="section-navigation__intro"><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/624235ea65058666ddc08d70_S2AG.svg" loading="lazy" alt="Logo for Semantic Scholar Academic Graph API" class="section-navigation__logo"/><a href="/product/api" class="section-navigation__title">Semantic Scholar API</a></div><ul role="list" class="section-navigation__links"><li><a href="/product/api" 
class="section-navigation__link">Overview</a></li><li><a href="/product/api/tutorial" aria-current="page" class="section-navigation__link w--current">Tutorial</a></li><li><a href="https://api.semanticscholar.org/api-docs/" class="section-navigation__link">Documentation</a></li><li><a href="/product/api/gallery" class="section-navigation__link">Gallery</a></li><li><a href="https://www.semanticscholar.org/paper/The-Semantic-Scholar-Open-Data-Platform-Kinney-Anastasiades/cb92a7f9d9dbcf9145e32fdfa0e70e2a6b828eb1" target="_blank" class="section-navigation__link section-navigation__link__cta">Cite the Paper</a></li></ul></div></div><div class="blade blade--white"><div class="blade__grid blade__grid--2-1 blade__grid--v-centered"><div class="blade__content"><h1>Semantic Scholar API - Tutorial</h1><p class="p__intro p__intro--header"><strong>Get Started with Semantic Scholar API</strong></p><p class="p__intro">Learn to search for papers and authors, download datasets, and more</p></div><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/6462a2d97df4b1fcdf6b814e_api-tutorials.png" loading="lazy" id="w-node-c3d2d69f-2104-52d5-a5b6-7621789aedfe-287a891b" sizes="(max-width: 479px) 95vw, (max-width: 767px) 94vw, (max-width: 991px) 93vw, 30vw" alt="Quick guide to get started with Semantic Scholar API" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/6462a2d97df4b1fcdf6b814e_api-tutorials-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/6462a2d97df4b1fcdf6b814e_api-tutorials-p-800.png 800w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/6462a2d97df4b1fcdf6b814e_api-tutorials.png 1000w"/></div></div><div id="Documentation" class="blade"><div class="blade__grid"><div class="card"><ul role="list" class="navigation__anchor w-list-unstyled"><li class="navigation__anchor-item"><a href="#introduction" data-section="start" class="navigation_anchor-link"><strong>Introduction</strong></a></li><li 
class="navigation__anchor-item navigation__anchor-item--nested"><a href="#checklist" data-section="start" class="navigation_anchor-link">What is an API?</a></li><li class="navigation__anchor-item navigation__anchor-item--nested"><a href="#endpoint" data-section="start" class="navigation_anchor-link">The Semantic Scholar APIs</a></li><li class="navigation__anchor-item navigation__anchor-item--nested"><a href="#base-urls" data-section="start" class="navigation_anchor-link">How to make requests faster and more efficiently</a></li><li class="navigation__anchor-item navigation__anchor-item--nested"><a href="#request" data-section="start" class="navigation_anchor-link">Example: Request paper details</a></li><li class="navigation__anchor-item navigation__anchor-item--header"><a href="#author" data-section="author" class="navigation_anchor-link"><strong>Make Calls to the Semantic Scholar API</strong></a></li><li class="navigation__anchor-item navigation__anchor-item--nested"><a href="#step-1-guide" data-section="author" class="navigation_anchor-link">Step 1: Keyword search for relevant papers</a></li><li class="navigation__anchor-item navigation__anchor-item--nested"><a href="#step-2-guide" data-section="author" class="navigation_anchor-link">Step 2: Get recommended papers</a></li><li class="navigation__anchor-item navigation__anchor-item--nested"><a href="#step-3-guide" data-section="author" class="navigation_anchor-link">Step 3: Look up authors</a></li><li class="navigation__anchor-item navigation__anchor-item--header"><a href="#datasets" data-section="datasets" class="navigation_anchor-link"><strong>Additional Resources</strong></a></li><li class="navigation__anchor-item navigation__anchor-item--nested"><a href="#pagination" data-section="datasets" class="navigation_anchor-link">Pagination</a></li><li class="navigation__anchor-item navigation__anchor-item--nested-copy"><a href="#search-query-params" data-section="datasets" class="navigation_anchor-link">Examples using search 
query parameters</a></li><li class="navigation__anchor-item navigation__anchor-item--nested-copy"><a href="#download-full-datasets" data-section="datasets" class="navigation_anchor-link">How to download full datasets</a></li><li class="navigation__anchor-item navigation__anchor-item--nested-copy"><a href="#incremental-diffs" data-section="datasets" class="navigation_anchor-link">How to update datasets with incremental diffs</a></li><li class="navigation__anchor-item navigation__anchor-item--nested-copy"><a href="#working-with-downloaded-datasets" data-section="datasets" class="navigation_anchor-link">Tips for working with downloaded datasets</a></li></ul></div><div id="w-node-_6ab34d5c-779c-d883-f516-9b39b8fa3a82-287a891b" class="accordion"><div id="start" class="accordion__section"><h2 id="introduction" class="accordion__header">Introduction</h2><div class="accordion__content accordion__content--open"><p>The Semantic Scholar REST API uses standard HTTP verbs, response codes, and authentication. This tutorial will teach you how to interact with the API by sending requests and analyzing the responses. All code examples are shown in Python. If you prefer a code-free experience, follow along using the <a href="https://www.postman.com/science-operator-43364886/workspace/semantic-scholar-examples/collection/37460422-e99f1d74-d11c-48c8-93a8-f33ec0e0aea1">Semantic Scholar Postman Collection</a>, which lets you test out the API on Postman, a popular and free API testing platform.</p><h3 id="checklist" class="documentation__header margin-top--none">What is an Application Programming Interface (API)?</h3><p>An API is a structured way for applications to communicate with each other. Applications can send API requests to one another, for instance to retrieve data.</p><p>Each API request consists of:<br/></p><ul role="list"><li>An API endpoint, which is the URL that requests are sent to. 
The URL consists of the API’s base URL and the specific endpoint’s resource path (see Figure 1).</li><li>A request method, such as GET or POST. This is sent in the HTTP request and tells the API what type of action to perform.</li></ul><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65553b12d016d1fc01d52f2d_GS_endpointUrlExample.png" loading="lazy" width="500" sizes="(max-width: 479px) 87vw, (max-width: 767px) 85vw, 500px" alt="a diagram displaying the base url [https://api.semanticscholar.org/graph/v1/] and resource path [/paper/search]" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65553b12d016d1fc01d52f2d_GS_endpointUrlExample-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65553b12d016d1fc01d52f2d_GS_endpointUrlExample-p-800.png 800w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65553b12d016d1fc01d52f2d_GS_endpointUrlExample.png 1034w" class="image-2"/><p>Figure 1. The URL of Semantic Scholar’s <a href="https://api.semanticscholar.org/api-docs/#tag/Paper-Data/operation/get_graph_paper_relevance_search">paper relevance search endpoint</a>.<br/></p><p>Each API request may also include:<br/></p><ul role="list"><li>Query parameters, which are appended to the end of the URL, after the resource path.</li><li>A request header, which may contain information about the API key being used.</li><li>A request body, which contains data being sent to the API.</li></ul><p>After the request is sent, the API will return a response. The response includes a status code indicating whether the request was successful. The response may also include requested data.</p><p>Common status codes are:<br/></p><ul role="list"><li><strong>200, OK</strong>. The request was successful.</li><li><strong>400, Bad Request</strong>. The server could not understand your request. Check your parameters.</li><li><strong>401, Unauthorized</strong>. 
You&#x27;re not authenticated or your credentials are invalid.</li><li><strong>403, Forbidden.</strong> The server understood the request but refused it. You don&#x27;t have permission to access the requested resource.</li><li><strong>404, Not Found. </strong>The requested resource or endpoint does not exist.</li><li><strong>429, Too Many Requests. </strong>You&#x27;ve hit the rate limit, slow down your requests.</li><li><strong>500, Internal Server Error. </strong>Something went wrong on the server’s side.</li></ul><h3 id="endpoint" class="documentation__header">The Semantic Scholar APIs</h3><p>Semantic Scholar contains three APIs, each with its own unique base URL:<br/></p><ul role="list"><li>Academic Graph API returns details about papers, paper authors, paper citations and references. Base URL: <a href="https://api.semanticscholar.org/graph/v1">https://api.semanticscholar.org/graph/v1</a><br/></li><li>Recommendations API recommends papers based on other papers you give it. Base URL: <a href="https://api.semanticscholar.org/recommendations/v1">https://api.semanticscholar.org/recommendations/v1</a></li><li>Datasets API lets you download Semantic Scholar’s datasets onto your local machine, so you can host the data yourself and do custom queries. Base URL: <a href="https://api.semanticscholar.org/datasets/v1/">https://api.semanticscholar.org/datasets/v1</a></li></ul><p>See the <a href="https://api.semanticscholar.org/api-docs/">Semantic Scholar API documentation</a> for more information about each API and their endpoints. The documentation describes how to correctly format requests and parse responses for each endpoint.<br/></p><h3 id="base-urls" class="documentation__header">How to make requests faster and more efficiently</h3><p>Heavy use of the API can cause a slowdown for everyone. Here are some tips to avoid hitting rate limit ceilings and slowdowns when making requests:<br/></p><ul role="list"><li><strong>Use an API Key. 
</strong>Users without API keys are affected by the traffic from all other unauthenticated users, who share a single API key. But using an individual API key automatically gives a user a 1 request per second rate across all endpoints. In some cases, users may be granted a slightly higher rate following a review. Learn more about API keys and how to request one <a href="https://www.semanticscholar.org/product/api#api-key">here</a>.<br/></li><li><strong>Use batch endpoints. </strong>Some endpoints have a corresponding batch or bulk endpoint that returns more results in a single response. Examples include the <a href="https://api.semanticscholar.org/api-docs/#tag/Paper-Data/operation/get_graph_paper_relevance_search">paper relevance search</a> (bulk version: <a href="https://api.semanticscholar.org/api-docs/#tag/Paper-Data/operation/get_graph_paper_bulk_search">paper bulk search</a>) and the <a href="https://api.semanticscholar.org/api-docs/#tag/Paper-Data/operation/get_graph_get_paper">paper details endpoint</a> (batch version: <a href="https://api.semanticscholar.org/api-docs/#tag/Paper-Data/operation/post_graph_get_papers">paper batch endpoint</a>). When requesting large quantities of data, use the bulk or batch versions whenever possible.</li><li><strong>Limit “fields” parameters. </strong>Most endpoints in the API contain the “fields” query parameter, which allows users to specify what data they want returned in the response. Avoid including more fields than you need, because that can slow down the response rate.</li><li><strong>Download Semantic Scholar Datasets. </strong>When you need a request rate that is higher than the rate provided by API keys, you can download Semantic Scholar’s datasets and run queries locally. The <a href="https://api.semanticscholar.org/api-docs/datasets">Datasets API</a> provides endpoints for easily downloading and maintaining Semantic Scholar datasets. 
See the <a href="#download-full-datasets">How to Download Full Datasets</a> section of the tutorial under Additional Resources for more details.</li></ul><h3 id="request" class="documentation__header">Example: Request paper details (using Python)</h3><p>Now we’ll make a request to the <a href="https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data/operation/get_graph_get_paper">paper details endpoint</a> by running Python code. Complete the steps listed under Prerequisites below before proceeding. If you prefer to follow along in Postman, the same request in Postman is located <a href="https://www.postman.com/science-operator-43364886/semantic-scholar-examples/request/nvkscgu/details-about-a-paper">here</a>. For more examples of API requests using Python, see the section <a href="#author">Make Calls to the Semantic Scholar API</a>.<br/></p><p><strong>Prerequisites:</strong><br/></p><ul role="list"><li>Install Python if it is not already on your machine.<br/></li><li>Install pip, Python&#x27;s package manager, if it is not already on your machine.</li></ul><p>According to the Academic Graph API documentation, the paper details endpoint is a GET method and its resource path is <strong>/paper/{paper_id}</strong>.<br/></p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66e9d70caf2154892b901a69_paper%20details%20path.png" loading="lazy" sizes="(max-width: 479px) 87vw, (max-width: 991px) 89vw, 800px" alt="Screenshot of the API documentation for the paper details endpoint, showing the GET method and the resource path /paper/{paper_id}" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66e9d70caf2154892b901a69_paper%20details%20path-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66e9d70caf2154892b901a69_paper%20details%20path-p-800.png 800w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66e9d70caf2154892b901a69_paper%20details%20path-p-1080.png 1080w, 
https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66e9d70caf2154892b901a69_paper%20details%20path-p-1600.png 1600w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66e9d70caf2154892b901a69_paper%20details%20path.png 1802w" class="image-2"/><p class="margin-top--sm">Figure 2. Each endpoint&#x27;s resource path is listed in the API documentation.<br/></p><p>When combined with the Academic Graph base URL, the endpoint’s URL is: <a href="https://api.semanticscholar.org/graph/v1/paper/%7Bpaper_id%7D">https://api.semanticscholar.org/graph/v1/paper/{paper_id}</a><br/></p><p>The curly brackets in the resource path indicate that <strong>paper_id</strong> is a path parameter, which is replaced by a value when the request is sent. Accepted formats for the value of <strong>paper_id</strong> are detailed in the Path Parameters section of the documentation.<br/></p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66e9dbb3be85dfbfe383219a_paper%20details%20parameters.png" loading="lazy" width="700" sizes="(max-width: 479px) 87vw, (max-width: 767px) 89vw, (max-width: 991px) 87vw, 700px" alt="Screenshot of the Path Parameters section of the API documentation, listing the accepted formats for paper_id" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66e9dbb3be85dfbfe383219a_paper%20details%20parameters-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66e9dbb3be85dfbfe383219a_paper%20details%20parameters-p-800.png 800w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66e9dbb3be85dfbfe383219a_paper%20details%20parameters-p-1080.png 1080w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66e9dbb3be85dfbfe383219a_paper%20details%20parameters.png 1232w" class="image-2"/><p class="margin-top--sm">Figure 3. 
Accepted formats are listed in the Path Parameters section.<br/></p><p>The Query Parameters section of the documentation lists only a single optional parameter: <strong>fields</strong>. The <strong>fields</strong> parameter takes a string of comma-separated field names, which tell the API what information to return in the response.<br/></p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/67003dbebdebc17eb7101eb8_query%20params%20fields.png" loading="lazy" width="700" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/67003dbebdebc17eb7101eb8_query%20params%20fields-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/67003dbebdebc17eb7101eb8_query%20params%20fields-p-800.png 800w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/67003dbebdebc17eb7101eb8_query%20params%20fields-p-1080.png 1080w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/67003dbebdebc17eb7101eb8_query%20params%20fields.png 1366w" alt="Screenshot of the Query Parameters section of the API documentation, showing the optional fields parameter" sizes="(max-width: 479px) 87vw, (max-width: 767px) 89vw, (max-width: 991px) 87vw, 700px" class="image-2"/><p class="margin-top--sm">Figure 4. Fields that can be returned in the response are listed in the Response Schema section of Responses.<br/></p><p>For our Python request, we&#x27;ll query the same paper ID given in the documentation’s example. 
We&#x27;ll request the paper’s <strong>title</strong>, the <strong>year</strong> of publication, the <strong>abstract</strong>, and the <strong>citationCount </strong>fields:</p><pre contenteditable="false" id="w-node-d0b48fdc-5d98-e9be-75db-2ca5c4d2f153-287a891b" class="code-block w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-python" style="white-space:pre"><span style="color:#dcc6e0">import</span><span> requests </span> <span>paperId = </span><span style="color:#abe338">&quot;649def34f8be52c8b66281af98ae884c09aef38b&quot;</span><span> </span> <span></span><span style="color:#d4d0ab"># Define the API endpoint URL</span><span> </span><span>url = </span><span style="color:#abe338">f&quot;https://api.semanticscholar.org/graph/v1/paper/</span><span class="hljs-subst" style="color:#abe338">{paperId}</span><span style="color:#abe338">&quot;</span><span> </span> <span></span><span style="color:#d4d0ab"># Define the query parameters</span><span> </span><span>query_params = {</span><span style="color:#abe338">&quot;fields&quot;</span><span>: </span><span style="color:#abe338">&quot;title,year,abstract,citationCount&quot;</span><span>} </span> <span></span><span style="color:#d4d0ab"># Directly define the API key (Reminder: Securely handle API keys in production environments)</span><span> </span><span>api_key = </span><span style="color:#abe338">&quot;your api key goes here&quot;</span><span> </span><span style="color:#d4d0ab"># Replace with the actual API key</span><span> </span> <span></span><span style="color:#d4d0ab"># Define headers with API key</span><span> </span><span>headers = {</span><span style="color:#abe338">&quot;x-api-key&quot;</span><span>: api_key} </span> <span></span><span style="color:#d4d0ab"># Send the API request</span><span> </span>response = requests.get(url, params=query_params, headers=headers) <span></span><span style="color:#d4d0ab"># Check response status</span><span> 
</span><span></span><span style="color:#dcc6e0">if</span><span> response.status_code == </span><span style="color:#f5ab35">200</span><span>: </span> response_data = response.json() <span> </span><span style="color:#d4d0ab"># Process and print the response data as needed</span><span> </span><span> </span><span style="color:#f5ab35">print</span><span>(response_data) </span><span></span><span style="color:#dcc6e0">else</span><span>: </span><span> </span><span style="color:#f5ab35">print</span><span>(</span><span style="color:#abe338">f&quot;Request failed with status code </span><span class="hljs-subst" style="color:#abe338">{response.status_code}</span><span style="color:#abe338">: </span><span class="hljs-subst" style="color:#abe338">{response.text}</span><span style="color:#abe338">&quot;</span><span>)</span></code></pre><p><br/>Note that this request is using an API key. The use of API keys is optional but recommended. Learn more about API keys and how to get one <a href="https://www.semanticscholar.org/product/api#api-key">here</a>.</p><p>We are using the Python Requests library to send the request. So we know the <a href="https://www.w3schools.com/python/ref_requests_response.asp">response has a property</a> named <strong>status_code</strong> that returns the response status. We check the <strong>status_code</strong> and either print the successfully returned data or the error message.</p><p>See the API documentation for how the response is formatted. 
Each Status Code section expands with further details about the response data that is returned.</p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66e9dfc4fd5ef4d187cdfa62_paper%20details%20responses.png" loading="lazy" width="700" sizes="(max-width: 479px) 87vw, (max-width: 767px) 89vw, (max-width: 991px) 87vw, 700px" alt="The Responses section of the API documentation for the paper details endpoint, listing the status codes that can be returned" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66e9dfc4fd5ef4d187cdfa62_paper%20details%20responses-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66e9dfc4fd5ef4d187cdfa62_paper%20details%20responses-p-800.png 800w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66e9dfc4fd5ef4d187cdfa62_paper%20details%20responses-p-1080.png 1080w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66e9dfc4fd5ef4d187cdfa62_paper%20details%20responses.png 1306w" class="image-2"/><p>Figure 5. 
The Responses section describes how responses are formatted.</p><p>When the request is successful, the JSON object returned in the response is:</p><pre contenteditable="false" id="w-node-_34ccd5be-7933-116f-a464-85a9b47b1592-287a891b" class="code-block w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-json" style="white-space:pre"><span>{ </span><span> </span><span class="hljs-attr">&quot;paperId&quot;</span><span>: </span><span style="color:#abe338">&quot;649def34f8be52c8b66281af98ae884c09aef38b&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;title&quot;</span><span>: </span><span style="color:#abe338">&quot;Construction of the Literature Graph in Semantic Scholar&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;abstract&quot;</span><span>: </span><span style="color:#abe338">&quot;We describe a deployed scalable system for organizing published ...&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;year&quot;</span><span>: </span><span style="color:#f5ab35">2018</span><span>, </span><span> </span><span class="hljs-attr">&quot;citationCount&quot;</span><span>: </span><span style="color:#f5ab35">365</span><span> </span>}</code></pre><p><br/>See the <a href="#author">Make Calls to the Semantic Scholar API</a> section for more Python examples using the paper search, paper recommendations, and authors endpoints.</p></div></div><div id="author" class="accordion__section"><h2 class="accordion__header">Make Calls to the Semantic Scholar API</h2><div class="accordion__content accordion__content--open"><p><strong>Use Case: Let&#x27;s suppose you are an early-career academic researcher interested in doing research in generative AI. 
You would like to learn about recent research developments in the generative AI field and discover what areas are most exciting for future research.</strong><br/></p><p>How to use Semantic Scholar: You can do a keyword search for relevant papers on generative AI. You can pick out a few papers that seem the most interesting, then get recommendations for more papers that are similar to them. You can examine the list of recommended papers to see which are the most cited and which authors worked on them, then look up other research by those same authors.</p><p>Let’s walk through those scenarios together. We’re going to use Python, but you can also follow along using the <a href="https://www.postman.com/science-operator-43364886/semantic-scholar-examples/collection/g4giumx/getting-started-with-semantic-scholar-api">Postman collection</a>.</p><h2 id="step-1-guide" class="documentation__header margin-top--none margin-bottom--sm">Step 1: Keyword search for relevant papers</h2><p><strong>Use Case: We want to learn more about generative AI, so we’ll start by searching for recent generative AI research papers. </strong><br/></p><p>Two <a href="https://api.semanticscholar.org/api-docs/graph">Academic Graph API</a> endpoints use Semantic Scholar’s <a href="https://blog.allenai.org/building-a-better-search-engine-for-semantic-scholar-ea23a0b661e7">custom-trained ranker</a> to perform keyword searches: the <a href="https://api.semanticscholar.org/api-docs/#tag/Paper-Data/operation/get_graph_paper_relevance_search">paper relevance search</a> endpoint and the <a href="https://api.semanticscholar.org/api-docs/#tag/Paper-Data/operation/get_graph_paper_bulk_search">paper bulk search</a> endpoint.<br/></p><p>Paper bulk search should be used in most cases because paper relevance search is more resource intensive. In exchange, the paper relevance search endpoint can return more detailed information about each paper’s authors, its referenced papers, and the papers that cite it. 
The paper bulk search endpoint supports sorting and special syntax in the query parameter. In this scenario, we will use the paper bulk search endpoint.<br/></p><h4><strong>Get the Endpoint URL</strong></h4><p>The Academic Graph API endpoint’s <a href="https://www.semanticscholar.org/product/api/tutorial#start">base URL</a> is: https://api.semanticscholar.org/graph/v1/ </p><p>Whenever we want to retrieve data from or send data to an endpoint in the Academic Graph, that’s how the URL starts. The <a href="https://api.semanticscholar.org/api-docs/#tag/Paper-Data/operation/get_graph_paper_bulk_search">API documentation for the paper bulk search</a> endpoint lists its resource path as <strong>/paper/search/bulk</strong>, so the endpoint’s full URL is:</p><ul role="list"><li>https://api.semanticscholar.org/graph/v1/paper/search/bulk <br/></li></ul><h4><strong>Set the Query Parameters</strong></h4><p>The paper bulk search API documentation lists the following query parameters:</p><ul role="list"><li><strong>query </strong>sets the search term<br/></li><li><strong>token</strong> automatically handles pagination<br/></li><li><strong>fields</strong> determines what data the API endpoint will return to you<br/></li><li><strong>sort </strong>allows users to sort the results by the paperId, publicationDate, or citationCount fields<br/></li><li><strong>publicationTypes</strong> filters results by paper publication type (e.g. 
journal articles)<br/></li><li><strong>openAccessPdf</strong> filters results by whether they contain public PDFs of papers<br/></li><li><strong>minCitationCount</strong> filters results by whether they have at least a given number of citations<br/></li><li><strong>publicationDateOrYear</strong> filters results by a date range<br/></li><li><strong>year</strong> filters results by a year range<br/></li><li><strong>venue</strong> filters results by publication venue<br/></li><li><strong>fieldsOfStudy</strong> filters results by the paper’s field of study<br/></li></ul><p>Only the first query parameter, <strong>query</strong>, is required in every request. The <strong>token</strong> query parameter isn’t included in the original request. Instead, it is returned in the response to the original request, then included in subsequent requests to automatically handle <a href="#pagination">pagination</a>.</p><p>In our request, we will include 3 query parameters: <strong>query</strong>, <strong>fields</strong>, and <strong>year</strong>:</p><ul role="list"><li>Use quotation marks in the <strong>query</strong> to search for the phrase “generative AI”. See the Additional Help section for more examples of using <a href="#search-query-params">search query syntax</a>.<br/></li><li>In <strong>fields</strong>, include the title, url, type of publication, date of publication, and link to the pdf of the paper. Separate field names with commas, without spaces. 
See the API documentation for all available field names.<br/></li><li>In <strong>year</strong>, filter for papers published during or after the year 2023 by using the “2023-” syntax.<br/></li></ul><p>These query parameters are appended to the end of the URL, so the complete URL looks like this: https://api.semanticscholar.org/graph/v1/paper/search/bulk?query=&quot;generative ai&quot;&amp;fields=title,url,publicationTypes,publicationDate,openAccessPdf&amp;year=2023-</p><h4><strong>Send the Request</strong></h4><p>The URL is long and hard to read, so in our code we’ll break it up a bit:</p><pre contenteditable="false" class="code-block margin-bottom--sm w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-python" style="white-space:pre"><span style="color:#dcc6e0">import</span><span> requests </span><span></span><span style="color:#dcc6e0">import</span><span> json </span> <span></span><span style="color:#d4d0ab"># Specify the search term</span><span> </span><span>query = </span><span style="color:#abe338">&#x27;&quot;generative ai&quot;&#x27;</span><span> </span> <span></span><span style="color:#d4d0ab"># Define the API endpoint URL</span><span> </span><span>url = </span><span style="color:#abe338">&quot;https://api.semanticscholar.org/graph/v1/paper/search/bulk&quot;</span><span> </span> <span></span><span style="color:#d4d0ab"># Define the query parameters</span><span> </span>query_params = { <span> </span><span style="color:#abe338">&quot;query&quot;</span><span>: query, </span><span> </span><span style="color:#abe338">&quot;fields&quot;</span><span>: </span><span style="color:#abe338">&quot;title,url,publicationTypes,publicationDate,openAccessPdf&quot;</span><span>, </span><span> </span><span style="color:#abe338">&quot;year&quot;</span><span>: </span><span style="color:#abe338">&quot;2023-&quot;</span><span> </span>} <span></span><span 
style="color:#d4d0ab"># Directly define the API key (Reminder: Securely handle API keys in production environments)</span><span> </span><span>api_key = </span><span style="color:#abe338">&quot;your api key goes here&quot;</span><span> </span><span style="color:#d4d0ab"># Replace with the actual API key</span><span> </span> <span></span><span style="color:#d4d0ab"># Define headers with API key</span><span> </span><span>headers = {</span><span style="color:#abe338">&quot;x-api-key&quot;</span><span>: api_key} </span> <span></span><span style="color:#d4d0ab"># Send the API request</span><span> </span>response = requests.get(url, params=query_params, headers=headers).json()</code></pre><p>The request is formatted and sent to the API endpoint, and the parsed JSON body of the response is captured in the variable <strong>response</strong>.</p><p>According to the API documentation, if the request was successful, with status code 200, the <strong>response</strong> variable contains three fields:</p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66eb4661fd8c190e301b9031_keyword%20search%20response.png" loading="lazy" sizes="(max-width: 479px) 87vw, (max-width: 991px) 89vw, 800px" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66eb4661fd8c190e301b9031_keyword%20search%20response-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66eb4661fd8c190e301b9031_keyword%20search%20response-p-800.png 800w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66eb4661fd8c190e301b9031_keyword%20search%20response-p-1080.png 1080w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66eb4661fd8c190e301b9031_keyword%20search%20response-p-1600.png 1600w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66eb4661fd8c190e301b9031_keyword%20search%20response.png 1862w" alt="The response schema for the paper bulk search endpoint in the API documentation, listing the total, token, and data fields"/><p>Figure 6. 
The API documentation lists the data format of the response schema.<br/></p><p>The <strong>total</strong> parameter is an estimate of how many papers were found that matched the search request, the <strong>token</strong> parameter is used for <a href="https://www.semanticscholar.org/product/api/tutorial#pagination">pagination</a>, and the <strong>data</strong> parameter contains the data returned from the endpoint. Note that the paper bulk search endpoint’s use of tokens to handle pagination is unlike the paper relevance search endpoint’s use of the <strong>offset</strong> and <strong>limit</strong> query parameters for <a href="#pagination">pagination</a>.<br/></p><p>The next part of our code saves the data returned from the endpoint to a json file titled <strong>papers.json</strong> and prints the code’s progress to the console. If the <strong>token</strong> parameter is present, fetch the next batch of responses.<br/></p><pre contenteditable="false" class="code-block margin-bottom--sm w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-python" style="white-space:pre"><span style="color:#f5ab35">print</span><span>(</span><span style="color:#abe338">f&quot;Will retrieve an estimated </span><span class="hljs-subst" style="color:#abe338">{response[</span><span class="hljs-subst" style="color:#abe338">&#x27;total&#x27;</span><span class="hljs-subst" style="color:#abe338">]}</span><span style="color:#abe338"> documents&quot;</span><span>) </span><span>retrieved = </span><span style="color:#f5ab35">0</span><span> </span> <span></span><span style="color:#d4d0ab"># Write results to json file and get next batch of results</span><span> </span><span></span><span style="color:#dcc6e0">with</span><span> </span><span style="color:#f5ab35">open</span><span>(</span><span style="color:#abe338">f&quot;papers.json&quot;</span><span>, </span><span style="color:#abe338">&quot;a&quot;</span><span>) </span><span 
style="color:#dcc6e0">as</span><span> file: </span><span> </span><span style="color:#dcc6e0">while</span><span> </span><span style="color:#f5ab35">True</span><span>: </span><span> </span><span style="color:#dcc6e0">if</span><span> </span><span style="color:#abe338">&quot;data&quot;</span><span> </span><span style="color:#dcc6e0">in</span><span> response: </span><span> retrieved += </span><span style="color:#f5ab35">len</span><span>(response[</span><span style="color:#abe338">&quot;data&quot;</span><span>]) </span><span> </span><span style="color:#f5ab35">print</span><span>(</span><span style="color:#abe338">f&quot;Retrieved </span><span class="hljs-subst" style="color:#abe338">{retrieved}</span><span style="color:#abe338"> papers...&quot;</span><span>) </span><span> </span><span style="color:#dcc6e0">for</span><span> paper </span><span style="color:#dcc6e0">in</span><span> response[</span><span style="color:#abe338">&quot;data&quot;</span><span>]: </span><span> </span><span style="color:#f5ab35">print</span><span>(json.dumps(paper), file=file) </span><span> </span><span style="color:#d4d0ab"># checks for continuation token to get next batch of results</span><span> </span><span> </span><span style="color:#dcc6e0">if</span><span> </span><span style="color:#abe338">&quot;token&quot;</span><span> </span><span style="color:#dcc6e0">not</span><span> </span><span style="color:#dcc6e0">in</span><span> response: </span><span> </span><span style="color:#dcc6e0">break</span><span> </span><span> response = requests.get(url, params={**query_params, </span><span style="color:#abe338">&quot;token&quot;</span><span>: response[</span><span style="color:#abe338">&quot;token&quot;</span><span>]}, headers=headers).json() </span> <span></span><span 
style="color:#f5ab35">print</span><span>(</span><span style="color:#abe338">f&quot;Done! Retrieved </span><span class="hljs-subst" style="color:#abe338">{retrieved}</span><span style="color:#abe338"> papers total&quot;</span><span>)</span></code></pre><p>Each data object in the <strong>papers.json</strong> file contains the fields we requested, as well as the paperId.<br/></p><pre contenteditable="false" class="code-block margin-bottom--sm w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-json" style="white-space:pre"><span>{ </span><span> </span><span class="hljs-attr">&quot;paperId&quot;</span><span>: </span><span style="color:#abe338">&quot;001720a782840652b573bb4794774aee826510ca&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;url&quot;</span><span>: </span><span style="color:#abe338">&quot;https://www.semanticscholar.org/paper/001720a782840652b573bb4794774aee826510ca&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;title&quot;</span><span>: </span><span style="color:#abe338">&quot;Developing Design Features to Facilitate AI-Assisted User Interactions&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;openAccessPdf&quot;</span><span>: </span><span style="color:#f5ab35">null</span><span>, </span><span> </span><span class="hljs-attr">&quot;publicationTypes&quot;</span><span>: </span><span style="color:#f5ab35">null</span><span>, </span><span> </span><span class="hljs-attr">&quot;publicationDate&quot;</span><span>: </span><span style="color:#abe338">&quot;2024-05-03&quot;</span><span> </span>} { <span> </span><span class="hljs-attr">&quot;paperId&quot;</span><span>: </span><span style="color:#abe338">&quot;0019e876188f781fdca0c0ed3bca39d0c70c2ad2&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;url&quot;</span><span>: </span><span 
style="color:#abe338">&quot;https://www.semanticscholar.org/paper/0019e876188f781fdca0c0ed3bca39d0c70c2ad2&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;title&quot;</span><span>: </span><span style="color:#abe338">&quot;Artificial intelligence prompt engineering as a new digital competence: Analysis of generative AI technologies such as ChatGPT&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;openAccessPdf&quot;</span><span>: { </span><span> </span><span class="hljs-attr">&quot;url&quot;</span><span>: </span><span style="color:#abe338">&quot;https://eber.uek.krakow.pl/index.php/eber/article/view/2142/863&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;status&quot;</span><span>: </span><span style="color:#abe338">&quot;GOLD&quot;</span><span> </span> }, <span> </span><span class="hljs-attr">&quot;publicationTypes&quot;</span><span>: [ </span><span> </span><span style="color:#abe338">&quot;JournalArticle&quot;</span><span>, </span><span> </span><span style="color:#abe338">&quot;Review&quot;</span><span> </span> ], <span> </span><span class="hljs-attr">&quot;publicationDate&quot;</span><span>: </span><span style="color:#f5ab35">null</span><span> </span>}</code></pre><p>See the S2folks GitHub code examples for <a href="https://github.com/allenai/s2-folks/tree/main/examples/python/search_bulk">another version of this call</a>.<br/></p><h2 id="step-2-guide" class="documentation__header margin-top--none margin-bottom--sm">Step 2: Get recommended papers</h2><p><strong>Use Case: In this section, we want to get a list of recommended papers based on a few interesting seed papers. We’re interested in the most cited papers, so we’ll request information about the recommended papers’ citation counts. 
</strong><br/></p><p>Two <a href="https://api.semanticscholar.org/api-docs/recommendations">Recommendations API</a> endpoints can recommend papers: one gives recommendations <a href="https://api.semanticscholar.org/api-docs/recommendations#tag/Paper-Recommendations/operation/get_papers_for_paper">based on a single seed paper</a> from the user, while <a href="https://api.semanticscholar.org/api-docs/recommendations#tag/Paper-Recommendations/operation/post_papers">the other takes a list</a> of positive seed papers and a list of negative seed papers from the user. Both endpoints return an array of papers in descending order of relevance.<br/></p><p>We will use the endpoint that takes two lists of positive and negative seed papers, with the URL: https://api.semanticscholar.org/recommendations/v1/papers<br/></p><p>This is a POST request, so we need to check the endpoint’s Request Body Schema section in the API documentation to see the format in which the data must be sent in the body of the request.<br/></p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66ec6711bb047c13032e7e6d_keyword%20search%20request%20body.png" loading="lazy" sizes="(max-width: 479px) 87vw, (max-width: 991px) 89vw, 800px" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66ec6711bb047c13032e7e6d_keyword%20search%20request%20body-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66ec6711bb047c13032e7e6d_keyword%20search%20request%20body-p-800.png 800w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66ec6711bb047c13032e7e6d_keyword%20search%20request%20body-p-1080.png 1080w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66ec6711bb047c13032e7e6d_keyword%20search%20request%20body.png 1402w" alt="The Request Body Schema section of the API documentation for the recommendations endpoint, showing the format of the POST request body"/><p>Figure 7. 
The Request Body Schema section describes how to format the data in a POST request.<br/></p><p>The positive and negative seed paperIds need to be sent as two arrays, <strong>positivePaperIds</strong> and <strong>negativePaperIds</strong>.<br/></p><p>For <strong>positivePaperIds</strong>, we’ll use two positive seed papers:<br/></p><ul role="list"><li><em>Human-Centred AI in Education in the Age of Generative AI Tools</em>, paperId 02138d6d094d1e7511c157f0b1a3dd4e5b20ebee<br/></li><li><em>Responsible Adoption of Generative AI in Higher Education: Developing a “Points to Consider” Approach Based on Faculty Perspectives</em>, paperId 018f58247a20ec6b3256fd3119f57980a6f37748<br/></li></ul><p>For <strong>negativePaperIds</strong>, we’ll use one negative seed paper:<br/></p><ul role="list"><li><em>A Novel Generative AI-Based Framework for Anomaly Detection in Multicast Messages in Smart Grid Communications</em>, paperId 0045ad0c1e14a4d1f4b011c92eb36b8df63d65bc<br/></li></ul><p>In our request to this API endpoint, we provide the following query parameters:<br/></p><ul role="list"><li>The <strong>fields</strong> query parameter, with the <strong>citationCount</strong> field, which returns how many times each paper is cited by other papers. We won’t include the <strong>influentialCitationCount</strong> field here, but that field keeps track of how often the paper has a big influence on other papers.</li><li>The <strong>limit</strong> query parameter, which limits the number of recommended papers returned. 
We’ll set this to the max value of 500.<br/></li></ul><p>In a new Python script, the request is formatted and sent to the API endpoint.<br/></p><pre contenteditable="false" class="code-block margin-bottom--sm w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-python" style="white-space:pre"><span style="color:#dcc6e0">import</span><span> requests </span><span></span><span style="color:#dcc6e0">import</span><span> json </span> <span></span><span style="color:#d4d0ab"># Define the API endpoint URL</span><span> </span><span>url = </span><span style="color:#abe338">&quot;https://api.semanticscholar.org/recommendations/v1/papers&quot;</span><span> </span> <span></span><span style="color:#d4d0ab"># Define the query parameters</span><span> </span>query_params = { <span> </span><span style="color:#abe338">&quot;fields&quot;</span><span>: </span><span style="color:#abe338">&quot;title,url,citationCount,authors&quot;</span><span>, </span><span> </span><span style="color:#abe338">&quot;limit&quot;</span><span>: </span><span style="color:#abe338">&quot;500&quot;</span><span> </span>} <span></span><span style="color:#d4d0ab"># Define the request data</span><span> </span>data = { <span> </span><span style="color:#abe338">&quot;positivePaperIds&quot;</span><span>: [ </span><span> </span><span style="color:#abe338">&quot;02138d6d094d1e7511c157f0b1a3dd4e5b20ebee&quot;</span><span>, </span><span> </span><span style="color:#abe338">&quot;018f58247a20ec6b3256fd3119f57980a6f37748&quot;</span><span> </span> ], <span> </span><span style="color:#abe338">&quot;negativePaperIds&quot;</span><span>: [ </span><span> </span><span style="color:#abe338">&quot;0045ad0c1e14a4d1f4b011c92eb36b8df63d65bc&quot;</span><span> </span> ] } <span></span><span style="color:#d4d0ab"># Directly define the API key (Reminder: Securely handle API keys in production environments)</span><span> </span><span>api_key = </span><span 
style="color:#abe338">&quot;your api key goes here&quot;</span><span> </span><span style="color:#d4d0ab"># Replace with the actual API key</span><span> </span> <span></span><span style="color:#d4d0ab"># Define headers with API key</span><span> </span><span>headers = {</span><span style="color:#abe338">&quot;x-api-key&quot;</span><span>: api_key} </span> <span></span><span style="color:#d4d0ab"># Send the API request</span><span> </span>response = requests.post(url, params=query_params, json=data, headers=headers).json() <span></span><span style="color:#d4d0ab"># Sort the recommended papers by citation count</span><span> </span><span>papers = response[</span><span style="color:#abe338">&quot;recommendedPapers&quot;</span><span>] </span><span>papers.sort(key=</span><span style="color:#dcc6e0">lambda</span><span> paper: paper[</span><span style="color:#abe338">&quot;citationCount&quot;</span><span>], reverse=</span><span style="color:#f5ab35">True</span><span>) </span> <span></span><span style="color:#dcc6e0">with</span><span> </span><span style="color:#f5ab35">open</span><span>(</span><span style="color:#abe338">&#x27;recommended_papers_sorted.json&#x27;</span><span>, </span><span style="color:#abe338">&#x27;w&#x27;</span><span>) </span><span style="color:#dcc6e0">as</span><span> output: </span> json.dump(papers, output)</code></pre><p>A successful request returns a response with the <strong>recommendedPapers</strong> parameter. 
Since we want to see the most cited papers, the papers are sorted by the <strong>citationCount</strong> parameter, then the results are written to a JSON file, recommended_papers_sorted.json:</p><pre contenteditable="false" class="code-block margin-bottom--sm w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-json" style="white-space:pre"><span>[ </span> { <span> </span><span class="hljs-attr">&quot;paperId&quot;</span><span>: </span><span style="color:#abe338">&quot;833ff07d2d1be9be7b12e88487d5631c141a2e95&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;url&quot;</span><span>: </span><span style="color:#abe338">&quot;https://www.semanticscholar.org/paper/833ff07d2d1be9be7b12e88487d5631c141a2e95&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;title&quot;</span><span>: </span><span style="color:#abe338">&quot;Teacher Professional Development on Self-Determination Theory\u2013Based Design Thinking in STEM Education&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;citationCount&quot;</span><span>: </span><span style="color:#f5ab35">24</span><span>, </span><span> </span><span class="hljs-attr">&quot;authors&quot;</span><span>: [ </span> { <span> </span><span class="hljs-attr">&quot;authorId&quot;</span><span>: </span><span style="color:#abe338">&quot;2281351310&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;name&quot;</span><span>: </span><span style="color:#abe338">&quot;Thomas K. F. Chiu&quot;</span><span> </span> }, { <span> </span><span class="hljs-attr">&quot;authorId&quot;</span><span>: </span><span style="color:#abe338">&quot;2281342663&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;name&quot;</span><span>: </span><span style="color:#abe338">&quot;C. 
Chai&quot;</span><span> </span> }, { <span> </span><span class="hljs-attr">&quot;authorId&quot;</span><span>: </span><span style="color:#abe338">&quot;2300302076&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;name&quot;</span><span>: </span><span style="color:#abe338">&quot;P. J. Williams&quot;</span><span> </span> }, { <span> </span><span class="hljs-attr">&quot;authorId&quot;</span><span>: </span><span style="color:#abe338">&quot;2300141520&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;name&quot;</span><span>: </span><span style="color:#abe338">&quot;Tzung-Jin Lin&quot;</span><span> </span> } ] }, { <span> </span><span class="hljs-attr">&quot;paperId&quot;</span><span>: </span><span style="color:#abe338">&quot;144b8d9c10ea111598aa239100cd6ed5c6137b1c&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;url&quot;</span><span>: </span><span style="color:#abe338">&quot;https://www.semanticscholar.org/paper/144b8d9c10ea111598aa239100cd6ed5c6137b1c&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;title&quot;</span><span>: </span><span style="color:#abe338">&quot;Artificial intelligence as part of future practices in the architect\u2019s work: MidJourney generative tool as part of a process of creating an architectural form&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;citationCount&quot;</span><span>: </span><span style="color:#f5ab35">19</span><span>, </span><span> </span><span class="hljs-attr">&quot;authors&quot;</span><span>: [ </span> { <span> </span><span class="hljs-attr">&quot;authorId&quot;</span><span>: </span><span style="color:#abe338">&quot;2300748516&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;name&quot;</span><span>: </span><span style="color:#abe338">&quot;Anna Jaruga-Rozdolska&quot;</span><span> </span> } ] },</code></pre><p>The recommended papers are now sorted in descending order of citation count, with a paper 
with 24 citations at the top of the list.</p><p>See the S2folks GitHub <a href="https://github.com/allenai/s2-folks/tree/main/examples/python/find_and_recommend_papers">for an example</a> of using the recommendations endpoint that takes a single seed paper.</p><h2 id="step-3-guide" class="documentation__header margin-top--none margin-bottom--sm">Step 3: Look up authors</h2><p><strong>Use Case: We want to get more information about the authors of the highest cited paper.</strong></p><p>The <a href="https://api.semanticscholar.org/api-docs/graph#tag/Author-Data/operation/post_graph_get_authors">batch authors endpoint</a> in Academic Graph API can return information about multiple authors: https://api.semanticscholar.org/graph/v1/author/batch.<strong> </strong></p><p>This endpoint is a POST, and it accepts an array of <strong>authorId</strong>s called <strong>ids</strong> in the request body. The array includes <strong>authorId</strong>s of the four authors of the most cited paper:</p><ul role="list"><li>2281351310<br/></li><li>2281342663<br/></li><li>2300302076<br/></li><li>2300141520<br/></li></ul><p>The only query parameter accepted by the endpoint is <strong>fields</strong>, where we can request more detailed information about the authors, including:</p><ul role="list"><li>The author <strong>name</strong><br/></li><li>The <strong>url</strong> of the author’s page on Semantic Scholar<br/></li><li>Their number of papers in Semantic Scholar, called <strong>paperCount</strong><br/></li><li>The author’s <strong>hIndex</strong>, a measure of their research impact<br/></li><li>An array of all <strong>papers</strong> by the author in Semantic Scholar<br/></li></ul><p>In a new Python script, the request is sent to the API endpoint.</p><pre contenteditable="false" class="code-block margin-bottom--sm w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-python" style="white-space:pre"><span 
style="color:#dcc6e0">import</span><span> requests </span><span></span><span style="color:#dcc6e0">import</span><span> json </span> <span></span><span style="color:#d4d0ab"># Define the API endpoint URL</span><span> </span><span>url = </span><span style="color:#abe338">&quot;https://api.semanticscholar.org/graph/v1/author/batch&quot;</span><span> </span> <span></span><span style="color:#d4d0ab"># Define the query parameters</span><span> </span>query_params = { <span> </span><span style="color:#abe338">&quot;fields&quot;</span><span>: </span><span style="color:#abe338">&quot;name,url,paperCount,hIndex,papers&quot;</span><span> </span>} <span></span><span style="color:#d4d0ab"># Define the request data</span><span> </span>data = { <span> </span><span style="color:#abe338">&quot;ids&quot;</span><span>: [</span><span style="color:#abe338">&quot;2281351310&quot;</span><span>,</span><span style="color:#abe338">&quot;2281342663&quot;</span><span>,</span><span style="color:#abe338">&quot;2300302076&quot;</span><span>,</span><span style="color:#abe338">&quot;2300141520&quot;</span><span>] </span>} <span></span><span style="color:#d4d0ab"># Directly define the API key (Reminder: Securely handle API keys in production environments)</span><span> </span><span>api_key = </span><span style="color:#abe338">&quot;your api key goes here&quot;</span><span> </span><span style="color:#d4d0ab"># Replace with the actual API key</span><span> </span> <span></span><span style="color:#d4d0ab"># Define headers with API key</span><span> </span><span>headers = {</span><span style="color:#abe338">&quot;x-api-key&quot;</span><span>: api_key} </span> <span></span><span style="color:#d4d0ab"># Send the API request</span><span> </span>response = requests.post(url, params=query_params, json=data, headers=headers).json() <span></span><span style="color:#d4d0ab"># Save the results to json file</span><span> </span><span></span><span style="color:#dcc6e0">with</span><span> </span><span 
style="color:#f5ab35">open</span><span>(</span><span style="color:#abe338">&#x27;author_information.json&#x27;</span><span>, </span><span style="color:#abe338">&#x27;w&#x27;</span><span>) </span><span style="color:#dcc6e0">as</span><span> output: </span> json.dump(response, output)</code></pre><p>The successful request returns an array of objects that contain author information.</p><pre contenteditable="false" class="code-block margin-bottom--sm w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-json" style="white-space:pre"><span>[ </span> { <span> </span><span class="hljs-attr">&quot;authorId&quot;</span><span>: </span><span style="color:#abe338">&quot;2281351310&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;url&quot;</span><span>: </span><span style="color:#abe338">&quot;https://www.semanticscholar.org/author/2281351310&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;name&quot;</span><span>: </span><span style="color:#abe338">&quot;Thomas K. F. 
Chiu&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;paperCount&quot;</span><span>: </span><span style="color:#f5ab35">2</span><span>, </span><span> </span><span class="hljs-attr">&quot;hIndex&quot;</span><span>: </span><span style="color:#f5ab35">1</span><span>, </span><span> </span><span class="hljs-attr">&quot;papers&quot;</span><span>: [ </span> { <span> </span><span class="hljs-attr">&quot;paperId&quot;</span><span>: </span><span style="color:#abe338">&quot;630642b7040a0c396967e4dab93cf73094fa4f8f&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;title&quot;</span><span>: </span><span style="color:#abe338">&quot;An experiential learning approach to learn AI in an online workshop&quot;</span><span> </span> }, { <span> </span><span class="hljs-attr">&quot;paperId&quot;</span><span>: </span><span style="color:#abe338">&quot;833ff07d2d1be9be7b12e88487d5631c141a2e95&quot;</span><span>, </span><span> </span><span class="hljs-attr">&quot;title&quot;</span><span>: </span><span style="color:#abe338">&quot;Teacher Professional Development on Self-Determination Theory\u2013Based Design Thinking in STEM Education&quot;</span><span> </span> } ] },</code></pre><p>See the S2folks GitHub for <a href="https://github.com/allenai/s2-folks/tree/main/examples/python/find_coauthored_papers">other interesting examples</a> of using the author endpoints.</p></div></div><div id="datasets" class="accordion__section"><h2 class="accordion__header">Additional Resources</h2><div class="accordion__content accordion__content--open"><h2 id="pagination" class="documentation__header margin-top--none margin-bottom--sm">Pagination</h2><p>Pagination is a technique used in APIs to manage and retrieve large sets of data in smaller, manageable chunks. 
This is particularly useful when dealing with extensive datasets to improve efficiency and reduce the load on both the client and server.<br/></p><p>Some Semantic Scholar endpoints, like <a href="https://api.semanticscholar.org/api-docs/#tag/Paper-Data/operation/get_graph_paper_relevance_search" target="_blank">paper relevance search</a>, require the use of the <strong>limit</strong> and <strong>offset</strong> parameters to handle pagination:<br/></p><ul role="list"><li><strong>Limit:</strong> Specifies the maximum number of items (e.g., papers) to be returned in a single API response. For example, in the request <a href="https://api.semanticscholar.org/graph/v1/paper/search?query=halloween&amp;limit=3">https://api.semanticscholar.org/graph/v1/paper/search?query=halloween&amp;limit=3</a>, the <strong>limit=3</strong> indicates that the response should include a maximum of 3 papers.</li><li><strong>Offset: </strong>Represents the starting point from which the API should begin fetching items. It helps skip a certain number of items. For example, if <strong>offset=10</strong>, the API will start retrieving items from the 11th item onward.</li></ul><p>Other endpoints, like <a href="https://api.semanticscholar.org/api-docs/#tag/Paper-Data/operation/get_graph_paper_bulk_search" target="_blank">paper bulk search</a>, require the use of the <strong>token</strong> parameter to handle pagination:<br/></p><ul role="list"><li><strong>Token: </strong>A “next” token or identifier provided in the response, pointing to the next set of items. It allows fetching the next page of results.</li></ul><p>In either case, the client requests the API for the first page of results. The API responds with a limited number of items. If there are more items to retrieve, the client can use the offset parameter or the next token in subsequent requests to get the next page of results until all items are fetched. 
This way, pagination allows clients to retrieve large datasets efficiently, page by page, based on their needs.</p><h2 id="search-query-params" class="documentation__header margin-top--none margin-bottom--sm">Examples using search query parameters</h2><p>Semantic Scholar’s <a href="https://api.semanticscholar.org/api-docs/#tag/Paper-Data/operation/get_graph_paper_bulk_search">paper bulk search</a> supports a variety of operators that enable advanced filtering and precise specifications in search queries. All keywords in the search query are matched against words in the paper’s <strong>title</strong> and <strong>abstract</strong>. Refer to the <a href="https://api.semanticscholar.org/api-docs/#tag/Paper-Data/operation/get_graph_paper_bulk_search">API Documentation</a> for all supported operators. Below are examples of varying complexity to help you get started.<br/></p><p><strong>Example 1.</strong><br/></p><pre contenteditable="false" class="code-block w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-markdown" style="white-space:pre"><span>((cloud computing) | virtualization) +security -privacy</span></code></pre><p>Matches papers containing the words &quot;cloud&quot; and &quot;computing&quot;, OR the word &quot;virtualization&quot; in their title or abstract. The paper title or abstract must also include the term &quot;security&quot; but should exclude the word &quot;privacy&quot;.
For example, a paper with the title &quot;Ensuring Security in Cloud Computing Environments&quot; could be included, unless its abstract contains the word &quot;privacy&quot;.<br/></p><p><strong>Example 2.</strong><br/></p><pre contenteditable="false" class="code-block w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-markdown" style="white-space:pre"><span>&quot;red blood cell&quot; + artificial intelligence</span></code></pre><p>Matches papers where the title or abstract contains the exact phrase “red blood cell” along with the words “artificial” and “intelligence”. For example, a paper with the title &quot;Applications of Artificial Intelligence in Healthcare&quot; would be included if it also contained the phrase “red blood cell” in its abstract.<br/></p><p><strong>Example 3.</strong><br/></p><pre contenteditable="false" class="code-block w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-markdown" style="white-space:pre"><span>fish</span><span style="font-style:italic">*</span></code></pre><p>Matches papers where the title or abstract contains words that begin with the prefix “fish”, such as “fishtank”, “fishes”, or “fishy”. For example, a paper with the title &quot;Ecology of Deep-Sea Fishes&quot; would be included.<br/></p><p><strong>Example 4.</strong><br/></p><pre contenteditable="false" class="code-block w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-markdown" style="white-space:pre"><span>bugs~3</span></code></pre><p>Matches papers where the title or abstract contains words within an edit distance of 3 from the word “bugs”, such as “buggy”, “but”, “buns”, “busg”, etc.
An edit is the addition, removal, or change of a single character.<br/></p><p><strong>Example 5.</strong><br/></p><pre contenteditable="false" class="code-block w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-markdown" style="white-space:pre"><span>&quot;blue lake&quot; ~3</span></code></pre><p>Matches papers where the title or abstract contains phrases with up to 3 terms between the words specified in the phrase. For example, a paper titled “Preserving blue lakes during the winter” or with an abstract containing a phrase such as “blue fishes in the lake” would be included.<br/></p><h2 id="download-full-datasets" class="documentation__header margin-top--none margin-bottom--sm">How to download full datasets</h2><p>Semantic Scholar datasets contain data on papers, authors, abstracts, embeddings, and more. Datasets are grouped by releases, and each release is a snapshot of the datasets at the time of that release date.
Make requests to the <a href="https://api.semanticscholar.org/api-docs/datasets">Datasets API</a> to see the list of available release dates, to list the datasets contained in a given release, and to get download links for datasets.<br/></p><p>All Semantic Scholar datasets are delivered in JSON format.<br/></p><h4><strong>Step 1: See all release dates</strong></h4><p>Use the <a href="https://api.semanticscholar.org/api-docs/datasets#tag/Release-Data/operation/get_releases">list of available releases</a> endpoint to see all dataset release dates.<br/></p><pre contenteditable="false" class="code-block w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-python" style="white-space:pre"><span style="color:#dcc6e0">import</span><span> requests </span> <span></span><span style="color:#d4d0ab"># Define base URL for datasets API</span><span> </span><span>base_url = </span><span style="color:#abe338">&quot;https://api.semanticscholar.org/datasets/v1/release/&quot;</span><span> </span> <span></span><span style="color:#d4d0ab"># To get the list of available releases make a request to the base url.
No additional parameters needed.</span><span> </span>response = requests.get(base_url) <span></span><span style="color:#d4d0ab"># Print the response data</span><span> </span><span></span><span style="color:#f5ab35">print</span><span>(response.json())</span></code></pre><p><br/>The response is a list of release dates, which contains all releases through the date the request was made:<br/></p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/655bd7bb84b2a03165c620d7_Screen%20Shot%202023-11-20%20at%202.00.38%20PM.png" loading="lazy" width="500" sizes="(max-width: 479px) 87vw, (max-width: 767px) 85vw, 500px" alt="" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/655bd7bb84b2a03165c620d7_Screen%20Shot%202023-11-20%20at%202.00.38%20PM-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/655bd7bb84b2a03165c620d7_Screen%20Shot%202023-11-20%20at%202.00.38%20PM-p-800.png 800w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/655bd7bb84b2a03165c620d7_Screen%20Shot%202023-11-20%20at%202.00.38%20PM.png 914w"/><h4><strong>Step 2: See all datasets for a given release date</strong></h4><p>Use the <a href="https://api.semanticscholar.org/api-docs/datasets#tag/Release-Data/operation/get_release">list of datasets in a release</a> endpoint to see all datasets contained in a given release. The endpoint takes the <strong>release_id</strong>, which is simply the release date, as a path parameter.
The <strong>release_id</strong> can also be set to “latest” instead of the actual date value to retrieve datasets from the latest release.</p><pre contenteditable="false" class="code-block w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-python" style="white-space:pre"><span style="color:#dcc6e0">import</span><span> requests </span> <span>base_url = </span><span style="color:#abe338">&quot;https://api.semanticscholar.org/datasets/v1/release/&quot;</span><span> </span> <span></span><span style="color:#d4d0ab"># Set the release id</span><span> </span><span>release_id = </span><span style="color:#abe338">&quot;2023-10-31&quot;</span><span> </span> <span></span><span style="color:#d4d0ab"># Make a request to get the datasets available in the given release</span><span> </span>response = requests.get(base_url + release_id) <span></span><span style="color:#d4d0ab"># Print the response data</span><span> </span><span></span><span style="color:#f5ab35">print</span><span>(response.json())</span></code></pre><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/655bdc2deb85e4b98b73c257_Screen%20Shot%202023-11-20%20at%202.19.07%20PM.png" loading="lazy" width="1000" sizes="(max-width: 479px) 87vw, (max-width: 991px) 89vw, 800px" alt="" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/655bdc2deb85e4b98b73c257_Screen%20Shot%202023-11-20%20at%202.19.07%20PM-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/655bdc2deb85e4b98b73c257_Screen%20Shot%202023-11-20%20at%202.19.07%20PM-p-800.png 800w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/655bdc2deb85e4b98b73c257_Screen%20Shot%202023-11-20%20at%202.19.07%20PM-p-1080.png 1080w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/655bdc2deb85e4b98b73c257_Screen%20Shot%202023-11-20%20at%202.19.07%20PM-p-1600.png 1600w,
https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/655bdc2deb85e4b98b73c257_Screen%20Shot%202023-11-20%20at%202.19.07%20PM.png 1816w"/><h4><strong>Step 3: Get download links for datasets</strong></h4><p>Use the <a href="https://api.semanticscholar.org/api-docs/datasets#tag/Release-Data/operation/get_dataset">download links for a dataset</a> endpoint to get download links for a specific dataset at a specific release date. This step requires the use of a Semantic Scholar API key.</p><pre contenteditable="false" class="code-block w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-python" style="white-space:pre"><span style="color:#dcc6e0">import</span><span> requests </span> <span>base_url = </span><span style="color:#abe338">&quot;https://api.semanticscholar.org/datasets/v1/release/&quot;</span><span> </span> <span></span><span style="color:#d4d0ab"># This endpoint requires authentication via api key</span><span> </span><span>api_key = </span><span style="color:#abe338">&quot;your api key goes here&quot;</span><span> </span><span>headers = {</span><span style="color:#abe338">&quot;x-api-key&quot;</span><span>: api_key} </span> <span></span><span style="color:#d4d0ab"># Set the release id</span><span> </span><span>release_id = </span><span style="color:#abe338">&quot;2023-10-31&quot;</span><span> </span> <span></span><span style="color:#d4d0ab"># Define dataset name you want to download</span><span> </span><span>dataset_name = </span><span style="color:#abe338">&#x27;papers&#x27;</span><span> </span> <span></span><span style="color:#d4d0ab"># Send the GET request and store the response in a variable</span><span> </span><span>response = requests.get(base_url + release_id + </span><span style="color:#abe338">&#x27;/dataset/&#x27;</span><span> + dataset_name, headers=headers) </span> <span></span><span style="color:#d4d0ab"># Process and print the response data</span><span> 
</span><span></span><span style="color:#f5ab35">print</span><span>(response.json())</span></code></pre><p><br/>The response contains the dataset name, description, a README with license and usage information, and temporary, pre-signed download links for the dataset files:</p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/655be047f947be0e9b7aab23_Screen%20Shot%202023-11-20%20at%202.38.16%20PM.png" loading="lazy" width="1000" sizes="(max-width: 479px) 87vw, (max-width: 991px) 89vw, 800px" alt="" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/655be047f947be0e9b7aab23_Screen%20Shot%202023-11-20%20at%202.38.16%20PM-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/655be047f947be0e9b7aab23_Screen%20Shot%202023-11-20%20at%202.38.16%20PM-p-800.png 800w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/655be047f947be0e9b7aab23_Screen%20Shot%202023-11-20%20at%202.38.16%20PM-p-1080.png 1080w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/655be047f947be0e9b7aab23_Screen%20Shot%202023-11-20%20at%202.38.16%20PM.png 1388w"/><h2 id="incremental-diffs" class="documentation__header margin-top--none margin-bottom--sm">How to update datasets with incremental diffs</h2><p>The <a href="https://api.semanticscholar.org/api-docs/datasets#tag/Incremental-Updates/operation/get_diff">incremental diffs</a> endpoint in the Datasets API allows users to get a comprehensive list of changes—or “diffs”—between any two releases. Full datasets can be updated from one release to another to avoid downloading and processing data that hasn&#x27;t changed. 
This endpoint requires the use of a Semantic Scholar API key.</p><p>This endpoint returns a list of all the &quot;diffs&quot; required to catch a given dataset up from the start release date to the end release date, with each “diff” object containing only the changes from one release to the next sequential release.</p><p>Each &quot;diff&quot; object itself contains two lists of files: an &quot;update files&quot; list and a &quot;delete files&quot; list. Records in the &quot;update files&quot; list need to be inserted or replaced by their primary key. Records in the &quot;delete files&quot; list should be removed from your dataset.</p><pre contenteditable="false" class="code-block w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-python" style="white-space:pre"><span style="color:#dcc6e0">import</span><span> requests </span> <span></span><span style="color:#d4d0ab"># Set the path parameters</span><span> </span><span>start_release_id = </span><span style="color:#abe338">&quot;2023-10-31&quot;</span><span> </span><span>end_release_id = </span><span style="color:#abe338">&quot;2023-11-14&quot;</span><span> </span><span>dataset_name = </span><span style="color:#abe338">&quot;authors&quot;</span><span> </span> <span></span><span style="color:#d4d0ab"># Set the API key. 
For best practice, store and retrieve API keys via environment variables</span><span> </span><span>api_key = </span><span style="color:#abe338">&quot;your api key goes here&quot;</span><span> </span><span>headers = {</span><span style="color:#abe338">&quot;x-api-key&quot;</span><span>: api_key} </span> <span></span><span style="color:#d4d0ab"># Construct the complete endpoint URL with the path parameters</span><span> </span><span>url = </span><span style="color:#abe338">f&quot;https://api.semanticscholar.org/datasets/v1/diffs/</span><span class="hljs-subst" style="color:#abe338">{start_release_id}</span><span style="color:#abe338">/to/</span><span class="hljs-subst" style="color:#abe338">{end_release_id}</span><span style="color:#abe338">/</span><span class="hljs-subst" style="color:#abe338">{dataset_name}</span><span style="color:#abe338">&quot;</span><span> </span> <span></span><span style="color:#d4d0ab"># Make the API request</span><span> </span>response = requests.get(url, headers=headers) <span></span><span style="color:#d4d0ab"># Extract the diffs from the response</span><span> </span><span>diffs = response.json()[</span><span style="color:#abe338">&#x27;diffs&#x27;</span><span>] </span><span></span><span style="color:#f5ab35">print</span><span>(diffs)</span></code></pre><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/6584a5de65866f1255d31b92_image3.png" loading="lazy" width="1000" sizes="(max-width: 479px) 87vw, (max-width: 991px) 89vw, 800px" alt="" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/6584a5de65866f1255d31b92_image3-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/6584a5de65866f1255d31b92_image3-p-800.png 800w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/6584a5de65866f1255d31b92_image3-p-1080.png 1080w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/6584a5de65866f1255d31b92_image3-p-1600.png 1600w, 
https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/6584a5de65866f1255d31b92_image3.png 1638w"/><h2 id="working-with-downloaded-datasets" class="documentation__header margin-top--none margin-bottom--sm">Tips for working with downloaded datasets</h2><p>Explore the following sections for inspiration on leveraging your downloaded data. Please be aware that the tools, libraries, and frameworks mentioned below are not a comprehensive list and their performance will vary based on the size of your data and machine’s capabilities. They are all external tools with no affiliation to Semantic Scholar, and are simply offered as suggestions to facilitate your initial exploration of our data.</p><h4><strong>Command line tools</strong></h4><p>Perhaps the simplest way to view your downloaded data is via the command line through commands like more and tools like <a href="https://jqlang.github.io/jq/">jq</a>.</p><p><strong>1. The more command</strong></p><p>You can use the <strong>more</strong> command without installing any external tool or library. This command is used to display the contents of a file in a paginated manner and lets you page through the contents of your downloaded file in chunks without loading up the entire dataset. It shows one screen of text at a time and allows you to navigate through the file using the <strong>spacebar</strong> (move forward one screen) and <strong>Enter</strong> (move forward one line) commands.</p><p><strong>Example</strong>: You downloaded the papers dataset, and renamed the file to “papersDataset”. Use the “more papersDataset” command to view the file:</p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/659454c9457f8c0452e28db5_image7.png" loading="lazy" alt="more papersDataset output"/><p><strong>2. 
The </strong><a href="https://jqlang.github.io/jq/" target="_blank"><strong>jq</strong></a><strong> tool</strong><br/></p><p><a href="https://jqlang.github.io/jq/" target="_blank">jq</a> is a lightweight and flexible command-line tool for exploring and manipulating JSON data. With <em>jq</em>, you can easily view formatted json output, select and view specific fields, filter data based on conditions, and more.<br/></p><p><strong>Example</strong>: You downloaded the papers dataset, and renamed the file to “papersDataset”. The <em>jq </em>command to format output is <span class="monospace">jq &#x27;.&#x27; &lt;file-name&gt;</span>, so use the <span class="monospace">jq . papersDataset</span> command to view the formatted file:<br/></p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/659455b9acf84e2f7019920b_image13.png" loading="lazy" sizes="(max-width: 750px) 87vw, 653px" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/659455b9acf84e2f7019920b_image13-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/659455b9acf84e2f7019920b_image13.png 653w" alt="jq . papersDataset output"/><p><strong>Example:</strong> You want to filter the publication venues down to only journals. You can use <em>jq</em> to filter json objects by a condition with the command <span class="monospace">jq &#x27; .
| select(has(&quot;type&quot;) and .type == &quot;journal&quot;)&#x27; publicationVenues</span></p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/659474d82ee7700345492769_image1.png" loading="lazy" sizes="(max-width: 479px) 87vw, (max-width: 732px) 86vw, 630px" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/659474d82ee7700345492769_image1-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/659474d82ee7700345492769_image1.png 630w" alt="jq Output"/><h4><strong>Python Pandas library</strong></h4><p><a href="https://pandas.pydata.org/docs/index.html" target="_blank">Pandas</a> is a powerful and easy-to-use data analysis and manipulation library available in Python. Using Pandas, you can effortlessly import, clean, and explore your data. One of the key structures in Pandas is a <a href="https://pandas.pydata.org/docs/user_guide/dsintro.html#dataframe">DataFrame</a>, which can be thought of as a table of information, akin to a spreadsheet with rows and columns. Each column has a name, similar to a header in Excel, and each row represents a set of related data. With a DataFrame, tasks like sorting, filtering, and analyzing your data are straightforward. Now we will see how to leverage basic Pandas functions to view and explore our Semantic Scholar data in a DataFrame.<br/></p><p><strong>Example</strong>: The <a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.head.html#pandas-dataframe-head"><strong>head</strong></a> function.
In Pandas you can use the <em>head( )</em> function to view the initial few rows of your dataframe.<br/></p><pre contenteditable="false" class="code-block margin-bottom--sm w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-python" style="white-space:pre"><span style="color:#dcc6e0">import</span><span> pandas </span><span style="color:#dcc6e0">as</span><span> pd </span> <span></span><span style="color:#d4d0ab"># Read JSON file into Pandas DataFrame. The ‘lines’ parameter indicates that our file contains one json object per line</span><span> </span><span>df = pd.read_json(</span><span style="color:#abe338">&#x27;publication venues dataset&#x27;</span><span>, lines=</span><span style="color:#f5ab35">True</span><span>) </span> <span></span><span style="color:#d4d0ab"># Print the first few rows of the DataFrame</span><span> </span><span></span><span style="color:#f5ab35">print</span><span>(df.head())</span></code></pre><p>The output is below. You will notice that this is a very wide dataframe, where each column represents a field in our json object (e.g. id, name, issn, url, etc.). By default pandas only shows the first and last columns. 
To view all the columns, you can configure the pandas display settings before printing your output, with pd.set_option(&#x27;display.max_columns&#x27;, None)<br/></p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65945831f2fb6b7e8199199d_image4.png" loading="lazy" sizes="(max-width: 479px) 87vw, (max-width: 767px) 89vw, (max-width: 889px) 87vw, 774px" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65945831f2fb6b7e8199199d_image4-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65945831f2fb6b7e8199199d_image4.png 774w" alt="Pandas head output"/><p class="margin-top--sm"><strong>Example</strong>: The <a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.count.html#pandas-dataframe-count"><strong>count</strong></a> function. We can use the <em>count( )</em> function to count the number of rows that have data in them (e.g. not null). This can be useful to test the quality of your dataset.<br/></p><pre contenteditable="false" class="code-block margin-bottom--sm w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-python" style="white-space:pre"><span style="color:#d4d0ab"># Display count of non-null values for each column</span><span> </span><span></span><span style="color:#f5ab35">print</span><span>(df.count())</span></code></pre><p>Output:<br/></p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/659459ad86a6cba3dcdbaf42_image17.png" loading="lazy" alt="Pandas count output"/><p class="margin-top--sm"><strong>Example: </strong>Filtering. We can filter our data by specifying conditions. For example, let’s assume we have loaded our authors&#x27; dataset into a dataframe, and want to filter by authors who have written at least 5 papers and been cited at least 10 times. 
After applying this filter, let&#x27;s select and display only the <em>authorid</em>, <em>name</em>, <em>papercount</em>, and <em>citationcount</em> fields.<br/></p><pre contenteditable="false" class="code-block margin-bottom--sm w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-python" style="white-space:pre"><span style="color:#d4d0ab">#filter dataframe by authors who have at least 5 publications and have been cited at least 10 times</span><span> </span><span>df = df[(df.papercount &gt;= </span><span style="color:#f5ab35">5</span><span>) &amp; (df.citationcount &gt;= </span><span style="color:#f5ab35">10</span><span>)] </span> <span></span><span style="color:#d4d0ab"># Select and print a subset of the columns in our filtered dataframe</span><span> </span><span></span><span style="color:#f5ab35">print</span><span>(df[[</span><span style="color:#abe338">&#x27;authorid&#x27;</span><span>, </span><span style="color:#abe338">&#x27;name&#x27;</span><span>, </span><span style="color:#abe338">&#x27;papercount&#x27;</span><span>, </span><span style="color:#abe338">&#x27;citationcount&#x27;</span><span>]])</span></code></pre><p>Output:<br/></p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/659467173c5db5bf0ec05946_image12.png" loading="lazy" alt="Python Pandas Filtering Output"/><p class="margin-top--sm"><strong>Example: </strong>Sorting. Pandas offers a variety of sorting functions to organize our data. In the example below, we use the <a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html#pandas-dataframe-sort-values"><em>sort_values( )</em></a> function to sort the dataframe by the “name” column and only display the <em>authorid </em>and <em>name</em> columns. The default is ascending order, so in this case our output will list authors in alphabetical order.<br/></p>
<pre contenteditable="false" class="code-block margin-bottom--sm w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-python" style="white-space:pre"><span style="color:#d4d0ab">#Let&#x27;s sort our authors in alphabetical order</span><span> </span><span>df = df.sort_values(by=</span><span style="color:#abe338">&#x27;name&#x27;</span><span>)</span></code></pre><p>Output:<br/></p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/6594676bbfe185b021779b9c_image6.png" loading="lazy" alt="Python Pandas Sorting Output"/><p class="margin-top--sm"><strong>Example: </strong>Check for missing values. Let’s say we want to assess the quality of our data by checking for missing (null) values.
We can count how many missing values we have by using the <a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isnull.html#pandas-dataframe-isnull">isnull()</a> and<a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sum.html#pandas-dataframe-sum"> sum()</a> functions.<br/></p><pre contenteditable="false" class="code-block margin-bottom--sm w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-python" style="white-space:pre"><span style="color:#d4d0ab"># Count and print the number of missing values for each author attribute</span><span> </span><span></span><span style="color:#f5ab35">print</span><span>(df.isnull().</span><span style="color:#f5ab35">sum</span><span>())</span></code></pre><p>Output:<br/></p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/659467bbec1d5d39423ebfa2_image2.png" loading="lazy" alt="Python Pandas Checking for missing values Output"/><h4><strong>Apache Spark (Python examples)</strong></h4><p><a href="https://spark.apache.org/">Apache Spark</a> is a fast and powerful processing engine that can analyze large-scale data faster than traditional methods via in-memory caching and optimized query execution. Spark offers APIs for a variety of programming languages, so you can utilize its capabilities regardless of the language you are coding in. In our examples we will showcase the <a href="https://spark.apache.org/docs/latest/api/python/index.html">Spark Python API</a>, commonly known as <em>PySpark</em>.<br/></p><p><strong>Example:</strong> The <a href="https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.show.html#pyspark-sql-dataframe-show"><strong>show</strong></a> function. PySpark’s <em>show( )</em> function is similar to <em>print( )</em> or <em>head( ) </em>in pandas and will display the first few rows of data. 
Let’s load up our <em>publication venues</em> data into a PySpark DataFrame and see how it looks:<br/></p><pre contenteditable="false" class="code-block margin-bottom--sm w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-python" style="white-space:pre"><span style="color:#dcc6e0">from</span><span> pyspark.sql </span><span style="color:#dcc6e0">import</span><span> SparkSession </span> <span></span><span style="color:#d4d0ab"># Create a Spark session</span><span> </span><span>spark = SparkSession.builder.appName(</span><span style="color:#abe338">&quot;dataset_exploration&quot;</span><span>).getOrCreate() </span> <span></span><span style="color:#d4d0ab"># Read the dataset file named &#x27;publication venues dataset&#x27; into a PySpark DataFrame. Depending on the directory you are working from you may need to include the complete file path.</span><span> </span><span>df = spark.read.json(</span><span style="color:#abe338">&quot;publication venues dataset&quot;</span><span>) </span> <span></span><span style="color:#d4d0ab"># Display the first few rows</span><span> </span>df.show()</code></pre><p>Output:<br/></p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65945aa7757080f139e86549_image14.png" loading="lazy" sizes="(max-width: 479px) 87vw, (max-width: 991px) 89vw, 800px" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65945aa7757080f139e86549_image14-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65945aa7757080f139e86549_image14-p-800.png 800w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65945aa7757080f139e86549_image14.png 1134w" alt="Apache Spark Show output"/><p class="margin-top--sm"><strong>Example: </strong>The <a href="https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.printSchema.html#pyspark-sql-dataframe-printschema"><strong>printSchema 
</strong></a>function. PySpark offers a handy <em>printSchema( )</em> function if you want to explore the structure of your data.<br/></p><pre contenteditable="false" class="code-block margin-bottom--sm w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-python" style="white-space:pre"><span style="color:#d4d0ab"># Display the object schema</span><span>
</span>df.printSchema()</code></pre><p>Output:<br/></p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65945b8dacf84e2f701d43e4_image18.png" loading="lazy" alt="Apache Spark print schema output"/><p class="margin-top--sm"><strong>Example: </strong>Summary statistics. PySpark offers a handy <a href="https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.DataFrame.describe.html#pyspark-sql-dataframe-describe"><em>describe( )</em></a> function to delve into and display summary statistics for the specified columns in our dataset. In this example we describe the <em>papercount</em>, <em>citationcount</em>, and <em>hindex</em> attributes of our author data. 
In the results we can see the average papercount of authors in this dataset, along with their average <em>citationcount</em>, <em>hindex</em>, and other common statistical measures.<br/></p><pre contenteditable="false" class="code-block margin-bottom--sm w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-python" style="white-space:pre"><span>df.describe([</span><span style="color:#abe338">&quot;papercount&quot;</span><span>, </span><span style="color:#abe338">&quot;citationcount&quot;</span><span>, </span><span style="color:#abe338">&quot;hindex&quot;</span><span>]).show()</span></code></pre><p>Output:<br/></p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/6594681f97fb083e30e5b889_image5.png" loading="lazy" alt="Apache Spark Summary Statistics Output"/><p class="margin-top--sm"><strong>Example: </strong>Sorting. We can call the <a href="https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.DataFrame.orderBy.html#pyspark-sql-dataframe-orderby">orderBy( )</a> function and specify the column we want to sort by, in this case papercount. We also call the desc() function to sort in descending order (from highest to lowest papercount). 
We also only want to display the <em>authorid, name, and papercount </em>fields, and display the top 3 records.<br/></p><pre contenteditable="false" class="code-block margin-bottom--sm w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-python" style="white-space:pre"><span style="color:#dcc6e0">from</span><span> pyspark.sql.functions </span><span style="color:#dcc6e0">import</span><span> col

</span><span>df = df.orderBy(col(</span><span style="color:#abe338">&quot;papercount&quot;</span><span>).desc())
</span><span>df.select(</span><span style="color:#abe338">&quot;authorid&quot;</span><span>, </span><span style="color:#abe338">&quot;name&quot;</span><span>, </span><span style="color:#abe338">&quot;papercount&quot;</span><span>).show(</span><span style="color:#f5ab35">3</span><span>)</span></code></pre><p>Output:<br/></p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65946a3beb365dc2cda25cee_image10.png" loading="lazy" alt="Apache Spark Summary Sorting Output"/><h4><strong>MongoDB</strong></h4><p><a href="https://www.mongodb.com/">MongoDB</a> is a fast and flexible database tool built for exploring and analyzing large scale datasets. Think of it as a robust digital warehouse where you can efficiently organize, store, and retrieve large volumes of data. In addition, MongoDB is a NoSQL database that stores data in a flexible schema-less format, scales horizontally, supports various data models, and is optimized for performance. MongoDB offers both <a href="https://www.mongodb.com/try/download/community">on-premise</a> and fully managed cloud options (<a href="https://www.mongodb.com/atlas">Atlas</a>) and can be accessed via the Mongo shell or a GUI (known as <a href="https://www.mongodb.com/products/tools/compass">Mongo Compass</a>). You can check out our guide on <a href="https://docs.google.com/document/d/1Ej5vCd-LZiOxo03b0D7XwgFKQt82v9yz2LaBgWrQQvE/edit#heading=h.2pwe9hxzxy1m">setting up Mongo</a> if you need help getting started. 
In the example below, we have imported a <em>papers </em>dataset into a Mongo Atlas cluster and show you how to leverage the Mongo Compass GUI to view and explore your data.<br/></p><p>Once you have imported your data, you can view it via Compass as shown in the example below. You can leverage the <a href="https://www.mongodb.com/docs/compass/current/">Compass documentation</a> to discover all its capabilities. We have listed some key items on the user interface to get you acquainted:<br/></p><ul role="list"><li>Data can be viewed in the default list view (shown below), object view, or table view by toggling the button on the upper right hand corner. In the list view, each ‘card’ displays a single record, or in this case a paper object. Notice that MongoDB appends its own ID, known as <em>ObjectId</em> to each record.</li><li>You can filter and analyze your data using the filter pane at the top of the screen, and click on the <em>Explain</em> button to see how your filters were applied to obtain your result set. Note that since Mongo is a NoSQL database, it has a slightly different query language from SQL to use for filtering and manipulation.</li><li>The default tab is the <em>Documents</em> tab where you can view and scroll through your data. You can also switch to the <em>Aggregations tab </em>to transform, filter, group, and perform aggregate operations on your dataset. In the <em>Schema</em> tab, Mongo provides an analysis of the schema of your dataset. When you click on the <em>Indexes</em> tab, you will find that the default index for searches is Mongo’s <em>ObjectId</em>. If you believe you will perform frequent searches using another attribute (e.g. 
<em>corpusid)</em>, you can add an additional index to optimize performance.</li><li>You can always add more data to your dataset via the green <em>Add Data </em>button right under the filter query bar<em> </em></li></ul><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65945c8bb49bc14ebd8513d0_image8.png" loading="lazy" sizes="(max-width: 479px) 87vw, (max-width: 991px) 89vw, 800px" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65945c8bb49bc14ebd8513d0_image8-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65945c8bb49bc14ebd8513d0_image8-p-800.png 800w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65945c8bb49bc14ebd8513d0_image8-p-1080.png 1080w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65945c8bb49bc14ebd8513d0_image8-p-1600.png 1600w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65945c8bb49bc14ebd8513d0_image8.png 1655w" alt="MongoDB UI"/><h5>Setting Up MongoDB</h5><p>You have the option of installing MongoDB onto your machine, or using their managed database-as-a-service option on the cloud, otherwise known as <a href="https://www.mongodb.com/atlas">Atlas</a>. Once you set up your database, you can download the GUI tool (<a href="https://www.mongodb.com/products/tools/compass">Mongo Compass</a>) and connect it to your database to visually interact with your data. If you are new to mongo and want to just explore, you can setup a free cluster on Atlas with just a few easy steps:<br/></p><p><strong>Set Up a Free Cluster on MongoDB Atlas:</strong><br/></p><ol role="list"><li>Sign Up/Login:<br/>1.1. Visit the MongoDB Atlas website.<br/>1.2. Sign up for a new account or log in if you already have one.</li><li>Create a New Cluster:<br/>2.1. After logging in, click on &quot;Build a Cluster.&quot;<br/>2.2. Choose the free tier (M0) or another desired plan.<br/>2.3. Select your preferred cloud provider and region.</li><li>Configure Cluster:<br/>3.1. 
Set up additional configurations, such as cluster name and cluster tier.<br/>3.2. Click &quot;Create Cluster&quot; to initiate the cluster deployment. It may take a few minutes.</li></ol><p><strong>Connect to MongoDB Compass:</strong><br/></p><ol role="list"><li>Download and Install MongoDB Compass:<br/>1.1. Download MongoDB Compass from the official website.<br/>1.2. Install the Compass application on your computer.</li><li>Retrieve Connection String:<br/>2.1. In MongoDB Atlas, go to the &quot;Clusters&quot; section.<br/>2.2. Click on &quot;Connect&quot; for your cluster.<br/>2.3. Choose &quot;Connect Your Application.&quot;<br/>2.4. Copy the connection string.</li><li>Connect Compass to Atlas:<br/>3.1. Open MongoDB Compass.<br/>3.2. Paste the connection string in the connection dialog.<br/>3.3. Modify the username, password, and database name if needed.<br/>3.4. Click &quot;Connect.&quot;</li></ol><p><strong>Import Data:</strong><br/></p><ol role="list"><li>Create a Database and Collection:<br/>1.1. In MongoDB Compass, navigate to the &quot;Database&quot; tab.<br/>1.2. Create a new database and collection by clicking &quot;Create Database&quot; and &quot;Add My Own Data.&quot;</li><li>Import Data:<br/>2.1. In the new collection, click &quot;Add Data&quot; and choose &quot;Import File.&quot;<br/>2.2. Select your JSON or CSV file containing the data.<br/>2.3. Map fields if necessary and click &quot;Import.&quot;</li><li>Verify Data:<br/>3.1. Explore the imported data in MongoDB Compass to ensure it&#x27;s displayed correctly.</li></ol><p>Now, you have successfully set up a free cluster on MongoDB Atlas, connected MongoDB Compass to the cluster, and imported data into your MongoDB database. 
This process allows you to start working with your data using MongoDB&#x27;s powerful tools.<br/></p><div class="card card--tip margin-bottom--sm"><p><strong>TIP: </strong>We recommend checking the Mongo website for the latest installation instructions and FAQ in case you run into any issues.<br/></p></div><p><strong>Example: </strong><a href="https://www.mongodb.com/docs/compass/current/query/filter/">Querying, Filtering, and Sorting</a>. Using the Mongo Compass GUI we can filter and sort our dataset per our needs. For example, let&#x27;s see which papers in Medicine were cited the most in the last 5 years, and exclude any papers with under 50 citations. In the <em>project </em>field we choose which fields we would like to display in the output, and we sort in descending order by <em>citationcount </em><br/></p><pre contenteditable="false" class="code-block w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-json" style="white-space:pre"><span>{ </span> &#x27;s2fieldsofstudy.category&#x27;: &#x27;Medicine&#x27;, &#x27;citationcount&#x27;: { <span> &#x27;$gte&#x27;: </span><span style="color:#f5ab35">50</span><span> </span> }, &#x27;year&#x27;: { <span> &#x27;$gte&#x27;: </span><span style="color:#f5ab35">2019</span><span>, </span><span> &#x27;$lte&#x27;: </span><span style="color:#f5ab35">2023</span><span> </span> } }</code></pre><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/6594740968bc21f84a11da81_image11.png" loading="lazy" sizes="(max-width: 479px) 87vw, (max-width: 991px) 89vw, 800px" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/6594740968bc21f84a11da81_image11-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/6594740968bc21f84a11da81_image11-p-800.png 800w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/6594740968bc21f84a11da81_image11.png 1005w" alt="MongoDB"/><p><strong>Output:</strong><br/></p><img 
src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65947442eea5f60bdc1b7328_image16.png" loading="lazy" sizes="(max-width: 479px) 87vw, (max-width: 767px) 673px, (max-width: 991px) 87vw, 673px" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65947442eea5f60bdc1b7328_image16-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/65947442eea5f60bdc1b7328_image16.png 673w" alt="MongoDB Output"/><h3 id="Working-with-Multiple-Datasets" class="documentation__sub-header">Working with Multiple Datasets</h3><p>Oftentimes we may want to combine information from multiple datasets to gather insights. Consider the following example:<br/></p><p><strong>Use case: </strong>Let’s delve into a publication venue, such as the “Journal of the Geological Society”, and learn more about the papers that have been published in it. Perhaps we would like to gather the names of authors who have published a paper in this journal, but only those whose papers have been cited at least 15 times. We can combine information from the <em>publication venues</em> dataset and the <em>papers </em>dataset to find the authors that meet these criteria. To do this, we can load our datasets into pandas dataframes and retrieve the publication venue ID associated with the “Journal of the Geological Society” from the <em>publication venues</em> dataset. Then we can search the <em>papers </em>dataset for papers that have a <em>citationcount</em> of at least 15 and are tagged to that venue ID. Finally, we can collect the names of authors associated with each of those papers that met our criteria. 
From this point you can explore other possibilities, such as viewing other papers published by those authors, checking out their homepage on the Semantic Scholar website, and more.<br/></p><p><strong>Python Example:</strong><br/></p><pre contenteditable="false" class="code-block margin-bottom--sm w-code-block" style="display:block;overflow-x:auto;background:#2b2b2b;color:#f8f8f2;padding:0.5em"><code class="language-python" style="white-space:pre"><span style="color:#dcc6e0">import</span><span> pandas </span><span style="color:#dcc6e0">as</span><span> pd </span> <span></span><span style="color:#d4d0ab"># Create Pandas DataFrames</span><span> </span><span>papers_df = pd.read_json(</span><span style="color:#abe338">&#x27;papersDataset&#x27;</span><span>, lines=</span><span style="color:#f5ab35">True</span><span>) </span><span>venues_df = pd.read_json(</span><span style="color:#abe338">&#x27;publicationVenuesDataset&#x27;</span><span>, lines=</span><span style="color:#f5ab35">True</span><span>) </span> <span></span><span style="color:#d4d0ab"># Find the venue id for our publication venue of interest - &quot;Journal of the Geological Society&quot;</span><span> </span><span>publication_venue_id = venues_df.loc[venues_df[</span><span style="color:#abe338">&quot;name&quot;</span><span>] == </span><span style="color:#abe338">&quot;Journal of the Geological Society&quot;</span><span>, </span><span style="color:#abe338">&quot;id&quot;</span><span>].values[</span><span style="color:#f5ab35">0</span><span>] </span> <span></span><span style="color:#d4d0ab"># Filter papers based on the venue id with a citation count of at least 15</span><span> </span>filtered_geology_papers = papers_df.loc[ <span> (papers_df[</span><span style="color:#abe338">&quot;publicationvenueid&quot;</span><span>] == publication_venue_id) &amp; (papers_df[</span><span style="color:#abe338">&quot;citationcount&quot;</span><span>] &gt;= </span><span style="color:#f5ab35">15</span><span>) </span>] 
<span></span><span style="color:#d4d0ab"># Traverse the list of authors for each paper that met our filter criteria and collect their names into a list</span><span> </span>author_names = [] <span></span><span style="color:#dcc6e0">for</span><span> authors_list </span><span style="color:#dcc6e0">in</span><span> filtered_geology_papers[</span><span style="color:#abe338">&quot;authors&quot;</span><span>]: </span><span> author_names.extend(author[</span><span style="color:#abe338">&quot;name&quot;</span><span>] </span><span style="color:#dcc6e0">for</span><span> author </span><span style="color:#dcc6e0">in</span><span> authors_list) </span> <span></span><span style="color:#d4d0ab"># Print the resulting author names, with each name on a new line</span><span> </span><span></span><span style="color:#f5ab35">print</span><span>(</span><span style="color:#abe338">&quot;Authors associated with papers from the Journal of the Geological Society:&quot;</span><span>) </span><span></span><span style="color:#f5ab35">print</span><span>(*author_names, sep=</span><span style="color:#abe338">&quot;\n&quot;</span><span>)</span></code></pre><p><strong>Output:</strong><br/></p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/6594759dbfe185b0217fe46c_image15.png" loading="lazy" sizes="(max-width: 479px) 87vw, (max-width: 713px) 86vw, 614px" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/6594759dbfe185b0217fe46c_image15-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/6594759dbfe185b0217fe46c_image15.png 614w" alt="Multiple Datasets Output"/></div></div></div></div></div><section id="api-form" class="blade blade--dark"><div class="blade__grid blade__grid--full"><div id="w-node-_1817117b-9c29-1007-b535-11341765bc29-287a891b" class="blade__content"><div class="card card--m card--centered"><div class="w-embed w-script"><!--[if lte IE 8]> <script charset="utf-8" type="text/javascript" 
src="//js.hsforms.net/forms/v2-legacy.js"></script> <![endif]--> <script charset="utf-8" type="text/javascript" src="//js.hsforms.net/forms/v2.js"></script> <script> hbspt.forms.create({ region: "na1", portalId: "5910970", formId: "105f3885-cfd8-4b57-a570-6174c5c1650a" }); </script></div></div></div></div></section></main><div class="cta__blade"><h4 class="cta__header">Join the Semantic Scholar API Community Slack Channel</h4><a href="https://join.slack.com/t/semanticschol-xyj3882/shared_invite/zt-2e98pwubp-vzoxaTgITyurw~~WK1OntQ" target="_blank" class="button button--hero w-button">Get Started</a></div><div class="post__list"><h4 class="post__list-heading">Latest News &amp; Updates</h4><div class="w-dyn-list"><div role="list" class="post__grid w-dyn-items"><div role="listitem" class="post w-dyn-item"><a href="https://blog.allenai.org/case-study-iterative-design-for-skimming-support-5563dbe0899e" target="_blank" class="post__link w-inline-block"><div class="post__image-wrapper"><img src="https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/652055861194e6fc6bb5983a_skimming_2.0.png" loading="lazy" alt="Case Study: Iterative Design for Skimming Support" sizes="(max-width: 479px) 85vw, (max-width: 767px) 84vw, (max-width: 991px) 87vw, 24vw" srcset="https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/652055861194e6fc6bb5983a_skimming_2.0-p-500.png 500w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/652055861194e6fc6bb5983a_skimming_2.0-p-800.png 800w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/652055861194e6fc6bb5983a_skimming_2.0-p-1080.png 1080w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/652055861194e6fc6bb5983a_skimming_2.0-p-1600.png 1600w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/652055861194e6fc6bb5983a_skimming_2.0-p-2000.png 2000w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/652055861194e6fc6bb5983a_skimming_2.0-p-2600.png 2600w, 
https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/652055861194e6fc6bb5983a_skimming_2.0-p-3200.png 3200w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/652055861194e6fc6bb5983a_skimming_2.0.png 5834w" class="post__image"/></div><h4 class="post__title">Case Study: Iterative Design for Skimming Support</h4><div class="post__meta"><div class="post__date">Oct 6, 2023</div><div class="post__read-time">7 min read</div></div><p class="post__intro">How might we help researchers quickly assess the relevance of scientific literature? Take a closer look at Skimming, Semantic Reader’s latest AI feature, and the collaborative design process behind it.</p></a><div class="post__author">Cassidy Trier</div></div><div role="listitem" class="post w-dyn-item"><a href="https://blog.allenai.org/behind-the-scenes-of-semantic-scholars-new-author-influence-design-d7e007ba6a84" target="_blank" class="post__link w-inline-block"><div class="post__image-wrapper"><img src="https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64de863a294e5ef80fee0187_Screen%20Shot%202023-02-06%20at%2011.36.31%20AM.png" loading="lazy" alt="Behind the Scenes of Semantic Scholar’s New Author Influence Design" sizes="(max-width: 479px) 85vw, (max-width: 767px) 84vw, (max-width: 991px) 87vw, 24vw" srcset="https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64de863a294e5ef80fee0187_Screen%20Shot%202023-02-06%20at%2011.36.31%20AM-p-500.png 500w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64de863a294e5ef80fee0187_Screen%20Shot%202023-02-06%20at%2011.36.31%20AM-p-800.png 800w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64de863a294e5ef80fee0187_Screen%20Shot%202023-02-06%20at%2011.36.31%20AM-p-1080.png 1080w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64de863a294e5ef80fee0187_Screen%20Shot%202023-02-06%20at%2011.36.31%20AM-p-1600.png 1600w, 
https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64de863a294e5ef80fee0187_Screen%20Shot%202023-02-06%20at%2011.36.31%20AM-p-2000.png 2000w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64de863a294e5ef80fee0187_Screen%20Shot%202023-02-06%20at%2011.36.31%20AM.png 2150w" class="post__image"/></div><h4 class="post__title">Behind the Scenes of Semantic Scholar’s New Author Influence Design</h4><div class="post__meta"><div class="post__date">Aug 17, 2023</div><div class="post__read-time">5 min read</div></div><p class="post__intro">We released a new version of Author Influence interface to help scholars better discover other scholars in their fields. Here&#x27;s how we identified user insights and made those design choices.</p></a><div class="post__author">Cassidy Trier, Evie Cheng, Ashley Lee</div></div><div role="listitem" class="post w-dyn-item"><a href="https://www.nature.com/articles/d41586-023-01907-z" target="_blank" class="post__link w-inline-block"><div class="post__image-wrapper"><img src="https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64d15e16ad0f9fd89058273b_nature.webp" loading="lazy" alt="Artificial-intelligence search engines wrangle academic literature" sizes="(max-width: 479px) 85vw, (max-width: 767px) 84vw, (max-width: 991px) 87vw, 24vw" srcset="https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64d15e16ad0f9fd89058273b_nature-p-500.webp 500w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64d15e16ad0f9fd89058273b_nature-p-800.webp 800w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64d15e16ad0f9fd89058273b_nature-p-1080.webp 1080w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64d15e16ad0f9fd89058273b_nature.webp 1248w" class="post__image"/></div><h4 class="post__title">Artificial-intelligence search engines wrangle academic literature</h4><div class="post__meta"><div class="post__date">Aug 7, 2023</div><div class="post__read-time">5 min read</div></div><p 
class="post__intro">Nature had a chat with Dan Weld, Chief Scientist at Semantic Scholar, to discuss how search engines are helping scientists explore and innovate by making it easier to draw connections from a massive collection of scientific literature.</p></a><div class="post__author">Amanda Heidt</div></div></div></div></div><footer class="site-footer"><div class="site-footer__top"><div class="site-footer__top-container"><div class="site-footer__about"><h6 class="site-footer site-footer__title">What Is Semantic Scholar?</h6><p class="site-footer site-footer__text">Semantic Scholar is a free, AI-powered research tool for scientific literature, based at Ai2.</p><a href="/about" class="site-footer site-footer__link">Learn More</a></div><div class="site-footer__navigation"><ul role="list" class="site-footer site-footer__list w-list-unstyled"><li><h6 class="site-footer site-footer__title">About</h6></li><li><a href="/about" class="site-footer site-footer__link">About Us<br/></a></li><li><a href="/about/team" class="site-footer site-footer__link">Meet the Team<br/></a></li><li><a href="/about/publishers" class="site-footer site-footer__link">Publishers</a></li><li><a href="https://medium.com/ai2-blog/semantic-scholar/home" target="_blank" class="site-footer site-footer__link">Blog</a></li><li><a href="https://allenai.org/careers?team=semantic+scholar#current-openings" target="_blank" class="site-footer site-footer__link">Ai2 Careers</a></li></ul><ul role="list" class="site-footer site-footer__list w-list-unstyled"><li><h6 class="site-footer site-footer__title">Product</h6></li><li><a href="/product" class="site-footer site-footer__link">Product Overview</a></li><li><a href="/product/semantic-reader" class="site-footer site-footer__link">Semantic Reader</a></li><li><a href="/product/scholars-hub" class="site-footer site-footer__link">Scholar&#x27;s Hub</a></li><li><a href="/product/beta-program" class="site-footer site-footer__link">Beta Program</a></li><li><a 
href="/product/release-notes" class="site-footer site-footer__link">Release Notes</a></li></ul><ul role="list" class="site-footer site-footer__list w-list-unstyled"><li><h6 class="site-footer site-footer__title">API</h6></li><li><a href="/product/api" class="site-footer site-footer__link">API Overview</a></li><li><a href="/product/api/tutorial" aria-current="page" class="site-footer site-footer__link w--current">API Tutorials</a></li><li><a href="https://api.semanticscholar.org/api-docs/" class="site-footer site-footer__link">API Documentation</a></li><li><a href="/product/api/gallery" class="site-footer site-footer__link">API Gallery</a></li></ul><ul id="w-node-_80db44ed-17f7-2024-a450-ff6046e68512-46e684e3" role="list" class="site-footer site-footer__list w-list-unstyled"><li><h6 class="site-footer site-footer__title">Research</h6></li><li><a href="https://allenai.org/papers?tag=Semantic%20Scholar" class="site-footer site-footer__link">Publications</a></li><li><a href="/research/research-team" class="site-footer site-footer__link">Researchers</a></li><li><a href="/research/careers" class="site-footer site-footer__link">Research Careers</a></li><li><a href="/research/prototypes" class="site-footer site-footer__link">Prototypes</a></li><li><a href="/resources" class="site-footer site-footer__link">Resources</a></li></ul><ul id="w-node-a1cfe8f5-f656-0f8f-b57f-f2c91de1b718-46e684e3" role="list" class="site-footer site-footer__list w-list-unstyled"><li><h6 class="site-footer site-footer__title">Help</h6></li><li><a href="https://www.semanticscholar.org/faq" class="site-footer site-footer__link">FAQ</a></li><li><a href="/about/librarians" class="site-footer site-footer__link">Librarians</a></li><li><a href="/product/tutorials" class="site-footer site-footer__link">Tutorials</a></li><li><a href="#" data-w-id="2cf6e605-c551-b5e7-40a2-70dbdd9705a1" class="site-footer site-footer__link site-footer__contact-trigger">Contact</a></li></ul></div></div></div><div 
class="site-footer__bottom"><div class="site-footer__bottom-container"><p class="site-footer__legal">Proudly built by <a href="https://allenai.org/" target="_blank" class="site-footer site-footer__link">Ai2</a> with the help of our Collaborators<br/><a href="https://allenai.org/terms.html" target="_blank" class="site-footer site-footer__link">Terms of Service</a>  •  <a href="https://allenai.org/privacy-policy.html" target="_blank" class="site-footer site-footer__link">Privacy Policy</a>  •  <a href="/product/api/license" class="site-footer site-footer__link">API License Agreement</a></p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66a167e36965869758dd87f8_Ai2_logo_offwhite_RGB.png" loading="lazy" sizes="94.5703125px" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66a167e36965869758dd87f8_Ai2_logo_offwhite_RGB-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66a167e36965869758dd87f8_Ai2_logo_offwhite_RGB-p-800.png 800w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66a167e36965869758dd87f8_Ai2_logo_offwhite_RGB-p-1080.png 1080w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66a167e36965869758dd87f8_Ai2_logo_offwhite_RGB-p-1600.png 1600w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66a167e36965869758dd87f8_Ai2_logo_offwhite_RGB-p-2000.png 2000w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66a167e36965869758dd87f8_Ai2_logo_offwhite_RGB.png 2771w" alt="" class="site-footer__logo"/></div></div><div class="contact-modal"><div class="contact-modal__container"><a data-w-id="094e8a79-f899-529e-250c-5240927de9d7" href="#" class="contact-modal__close w-inline-block"><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/61e9b4e329b9d877dee723c3_close-light.svg" loading="lazy" alt="Close" class="contact-modal__close-jewel"/></a><h4 class="margin-top--none">Contact Us</h4><div class="contact-modal__form-wrapper w-form"><form 
id="freshdesk-contact-form" name="wf-form-Contact" data-name="Contact" action="https://www.semanticscholar.org/api/1/feedback" method="post" class="contact-modal__form" data-wf-page-id="6584745360a4872a287a891b" data-wf-element-id="7663d2bf-5ed9-4856-37e4-a4966bfbf84f"><p class="margin-bottom--sm">Please visit our <a href="https://www.semanticscholar.org/faq">FAQ</a> to find helpful information before submitting your question.<br/></p><label for="contact-form-name">Your name</label><input class="w-input" maxlength="256" name="name" data-name="name" placeholder="" type="text" id="contact-form-name"/><label for="contact-form-email">Your email</label><input class="w-input" maxlength="256" name="email" data-name="email" placeholder="" type="email" id="contact-form-email" required=""/><label for="contact-form-subject">Subject<br/></label><input class="w-input" maxlength="256" name="subject" data-name="subject" placeholder="" type="text" id="contact-form-subject" required=""/><label for="contact-form-topic">Topic<br/></label><select id="contact-form-topic" name="topic" data-name="topic" required="" class="select-field w-select"><option value="">Select A Topic</option><option value="Takedown Request">Remove A Paper</option><option value="Author Disambiguation">Merge Authors</option><option value="Other Problem">Other</option></select><label for="contact-form-feedback">Feedback<br/></label><textarea id="contact-form-feedback" name="feedback" maxlength="5000" data-name="feedback" placeholder="" required="" class="margin-bottom--sm w-input"></textarea><input type="submit" data-wait="Please wait..." class="button w-button" value="Contact Us"/></form><div class="contact-modal__form-success w-form-done"><div><strong>Thanks! 
</strong>Your feedback has been submitted.</div></div>
<div class="contact-modal__form-error w-form-fail"><div>Something went wrong while submitting the form, please try again.</div></div>
</div></div>
<div data-w-id="094e8a79-f899-529e-250c-5240927de9fe" class="contact-modal__overlay"></div>
</div></footer>
<script src="https://d3e54v103j8qbb.cloudfront.net/js/jquery-3.5.1.min.dc5e7f18c8.js?site=605236bb767e9a5bb229c63c" type="text/javascript" integrity="sha256-9/aliU8dGd2tb6OSsuzixeV4y/faTqgFtohetphbbj0=" crossorigin="anonymous"></script>
<script src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/js/semanticscholar.15d8b03cf.js" type="text/javascript"></script>
<script>
  /*
   * Contact modal behaviour.
   *
   * 1. Intercepts submission of the contact form and posts its fields as JSON
   *    to the URL in the form's "action" attribute.
   * 2. Opens the contact modal when a link ending in "about/contact" inside
   *    ".main" is clicked, instead of following the link.
   *
   * Every "//" comment below must stay on its own line: a line comment runs to
   * the end of the physical line, so collapsing this script onto one line
   * comments out all of the code that follows it.
   */
  $(document).ready(function() {
    // Contact Form
    $('.contact-modal__form').submit(function(e) {
      // Stops regular form submit
      e.preventDefault();

      // Sets variables, encodes form into json:
      // serializeArray() yields [{name, value}, ...]; fold it into one object.
      var $this = $(this),
          $parent = $this.parent(),
          $success = $parent.find(".contact-modal__form-success"),
          action = $this.attr('action'),
          submission = $this.serializeArray().reduce((memo, field) => ({...memo, [field.name]: field.value}), {});

      // Record URL
      submission.url = window.location.href;

      // Submit
      $.ajax(action, {
        method: 'POST',
        contentType: 'application/json',
        data: JSON.stringify(submission),
        cache: false,
        dataType: 'json',
        crossDomain: true,
        // The payload is already a JSON string; do not let jQuery re-encode it.
        processData: false
      }).always(function() {
        // Hides form, shows success.
        // always() fires on both success and failure, so the success panel is
        // shown whatever the outcome and the ".contact-modal__form-error"
        // panel is never revealed by this handler.
        // NOTE(review): presumably deliberate (e.g. the endpoint's response
        // may not parse as JSON) - confirm before switching to done()/fail().
        $this.hide();
        $success.show();
      });

      // just in case
      return false;
    });

    // Listens for links to /about/contact and pops up contact form instead of redirecting.
    $('.main a[href$="about/contact"]').on('click', function(e) {
      e.preventDefault();
      $('.contact-modal').show();
    });
  });
</script>
<script>
  /*
   * Accordion and in-page navigation behaviour.
   *
   * - On load, if the URL hash names an element on the page, opens the
   *   accordion section for it and scrolls it into view.
   * - Clicking an ".accordion__header" toggles the element that follows it.
   * - Clicking a ".navigation__anchor-item a" link opens the section named by
   *   the link's "data-section" attribute, pushes the link's href onto the
   *   history, and scrolls to the href's target.
   *
   * Every "//" comment below must stay on its own line (see the note on line
   * comments: they run to the end of the physical line).
   */
  $(document).ready(function() {
    // If there's a hashtag, resolve it by id rather than as a selector string:
    // a hash that is not a valid selector would make $() / querySelector()
    // throw, and a hash that matches nothing would make scrollIntoView() fail
    // on null. Either error would abort this callback before the click
    // handlers below are bound.
    var hashId = window.location.hash.slice(1),
        targetEl = hashId ? document.getElementById(hashId) : null;

    if (targetEl) {
      var $target = $(targetEl),
          $section;

      if ($target.hasClass('accordion__section')) {
        // The hash names a whole section: open its content panel(s).
        $section = $target.find('.accordion__content');
      } else {
        // The hash names something else: open its parent element.
        $section = $target.parent();
      }

      // Opens the correct section
      $('.accordion__content--open').removeClass('accordion__content--open');
      $section.addClass('accordion__content--open');

      // scrolls to correct section
      targetEl.scrollIntoView({ behavior: 'smooth' });
    }

    // Controls accordion interactions
    $('.accordion__header').on('click', function() {
      var accordion_content = $(this).next();
      accordion_content.toggleClass('accordion__content--open');
    });

    // Controls navigation
    $('.navigation__anchor-item a').on('click', function(e) {
      // Stops webflow behavior
      e.stopPropagation();

      // Opens appropriate section
      var $this = $(this),
          $sections = $('.accordion__content'),
          $section = $("#" + $this.attr('data-section') + ' .accordion__content');
      $sections.removeClass('accordion__content--open');
      $section.addClass('accordion__content--open');
      history.pushState(null, null, this.getAttribute('href'));

      // Scrolls to section (skipped when the href matches no element)
      var destination = document.querySelector(this.getAttribute('href'));
      if (destination) {
        destination.scrollIntoView({ behavior: 'smooth' });
      }

      // Stops default browser behavior
      return false;
    });
  });
</script>
</body></html>

Pages: 1 2 3 4 5 6 7 8 9 10