<!-- CINXE.COM
Steve Baskauf's blog: February 2020 -->
<!DOCTYPE html> <html class='v2' dir='ltr' lang='en'> <head> <link href='https://www.blogger.com/static/v1/widgets/3566091532-css_bundle_v2.css' rel='stylesheet' type='text/css'/> <meta content='width=1100' name='viewport'/> <meta content='text/html; charset=UTF-8' http-equiv='Content-Type'/> <meta content='blogger' name='generator'/> <link href='http://baskauf.blogspot.com/favicon.ico' rel='icon' type='image/x-icon'/> <link href='http://baskauf.blogspot.com/2020/02/' rel='canonical'/> <link rel="alternate" type="application/atom+xml" title="Steve Baskauf's blog - Atom" href="http://baskauf.blogspot.com/feeds/posts/default" /> <link rel="alternate" type="application/rss+xml" title="Steve Baskauf's blog - RSS" href="http://baskauf.blogspot.com/feeds/posts/default?alt=rss" /> <link rel="service.post" type="application/atom+xml" title="Steve Baskauf's blog - Atom" href="https://www.blogger.com/feeds/5299754536670281996/posts/default" /> <!--Can't find substitution for tag [blog.ieCssRetrofitLinks]--> <meta content='http://baskauf.blogspot.com/2020/02/' property='og:url'/> <meta content="Steve Baskauf's blog" property='og:title'/> <meta content='' property='og:description'/> <title>Steve Baskauf's blog: February 2020</title> <style id='page-skin-1' type='text/css'><!-- /* ----------------------------------------------- Blogger Template Style Name: Simple Designer: Blogger URL: www.blogger.com ----------------------------------------------- */ /* Content ----------------------------------------------- */ body { font: normal normal 12px 'Trebuchet MS', Trebuchet, Verdana, sans-serif; color: #666666; background: #ffffff none repeat scroll top left; padding: 0 0 0 0; } html body .region-inner { min-width: 0; max-width: 100%; width: auto; } h2 { font-size: 22px; } a:link { text-decoration:none; color: #8832ff; } a:visited { text-decoration:none; color: #bb2188; } a:hover { text-decoration:underline; color: #33aaff; } .body-fauxcolumn-outer .fauxcolumn-inner { background: 
transparent none repeat scroll top left; _background-image: none; } .body-fauxcolumn-outer .cap-top { position: absolute; z-index: 1; height: 400px; width: 100%; } .body-fauxcolumn-outer .cap-top .cap-left { width: 100%; background: transparent none repeat-x scroll top left; _background-image: none; } .content-outer { -moz-box-shadow: 0 0 0 rgba(0, 0, 0, .15); -webkit-box-shadow: 0 0 0 rgba(0, 0, 0, .15); -goog-ms-box-shadow: 0 0 0 #333333; box-shadow: 0 0 0 rgba(0, 0, 0, .15); margin-bottom: 1px; } .content-inner { padding: 10px 40px; } .content-inner { background-color: #ffffff; } /* Header ----------------------------------------------- */ .header-outer { background: transparent none repeat-x scroll 0 -400px; _background-image: none; } .Header h1 { font: normal normal 40px 'Trebuchet MS',Trebuchet,Verdana,sans-serif; color: #000000; text-shadow: 0 0 0 rgba(0, 0, 0, .2); } .Header h1 a { color: #000000; } .Header .description { font-size: 18px; color: #000000; } .header-inner .Header .titlewrapper { padding: 22px 0; } .header-inner .Header .descriptionwrapper { padding: 0 0; } /* Tabs ----------------------------------------------- */ .tabs-inner .section:first-child { border-top: 0 solid #dddddd; } .tabs-inner .section:first-child ul { margin-top: -1px; border-top: 1px solid #dddddd; border-left: 1px solid #dddddd; border-right: 1px solid #dddddd; } .tabs-inner .widget ul { background: transparent none repeat-x scroll 0 -800px; _background-image: none; border-bottom: 1px solid #dddddd; margin-top: 0; margin-left: -30px; margin-right: -30px; } .tabs-inner .widget li a { display: inline-block; padding: .6em 1em; font: normal normal 12px 'Trebuchet MS', Trebuchet, Verdana, sans-serif; color: #000000; border-left: 1px solid #ffffff; border-right: 1px solid #dddddd; } .tabs-inner .widget li:first-child a { border-left: none; } .tabs-inner .widget li.selected a, .tabs-inner .widget li a:hover { color: #000000; background-color: #eeeeee; text-decoration: none; } /* 
Columns ----------------------------------------------- */ .main-outer { border-top: 0 solid transparent; } .fauxcolumn-left-outer .fauxcolumn-inner { border-right: 1px solid transparent; } .fauxcolumn-right-outer .fauxcolumn-inner { border-left: 1px solid transparent; } /* Headings ----------------------------------------------- */ div.widget > h2, div.widget h2.title { margin: 0 0 1em 0; font: normal bold 11px 'Trebuchet MS',Trebuchet,Verdana,sans-serif; color: #000000; } /* Widgets ----------------------------------------------- */ .widget .zippy { color: #999999; text-shadow: 2px 2px 1px rgba(0, 0, 0, .1); } .widget .popular-posts ul { list-style: none; } /* Posts ----------------------------------------------- */ h2.date-header { font: normal bold 11px Arial, Tahoma, Helvetica, FreeSans, sans-serif; } .date-header span { background-color: #bbbbbb; color: #ffffff; padding: 0.4em; letter-spacing: 3px; margin: inherit; } .main-inner { padding-top: 35px; padding-bottom: 65px; } .main-inner .column-center-inner { padding: 0 0; } .main-inner .column-center-inner .section { margin: 0 1em; } .post { margin: 0 0 45px 0; } h3.post-title, .comments h4 { font: normal normal 22px 'Trebuchet MS',Trebuchet,Verdana,sans-serif; margin: .75em 0 0; } .post-body { font-size: 110%; line-height: 1.4; position: relative; } .post-body img, .post-body .tr-caption-container, .Profile img, .Image img, .BlogList .item-thumbnail img { padding: 2px; background: #ffffff; border: 1px solid #eeeeee; -moz-box-shadow: 1px 1px 5px rgba(0, 0, 0, .1); -webkit-box-shadow: 1px 1px 5px rgba(0, 0, 0, .1); box-shadow: 1px 1px 5px rgba(0, 0, 0, .1); } .post-body img, .post-body .tr-caption-container { padding: 5px; } .post-body .tr-caption-container { color: #666666; } .post-body .tr-caption-container img { padding: 0; background: transparent; border: none; -moz-box-shadow: 0 0 0 rgba(0, 0, 0, .1); -webkit-box-shadow: 0 0 0 rgba(0, 0, 0, .1); box-shadow: 0 0 0 rgba(0, 0, 0, .1); } .post-header { margin: 
0 0 1.5em; line-height: 1.6; font-size: 90%; } .post-footer { margin: 20px -2px 0; padding: 5px 10px; color: #666666; background-color: #eeeeee; border-bottom: 1px solid #eeeeee; line-height: 1.6; font-size: 90%; } #comments .comment-author { padding-top: 1.5em; border-top: 1px solid transparent; background-position: 0 1.5em; } #comments .comment-author:first-child { padding-top: 0; border-top: none; } .avatar-image-container { margin: .2em 0 0; } #comments .avatar-image-container img { border: 1px solid #eeeeee; } /* Comments ----------------------------------------------- */ .comments .comments-content .icon.blog-author { background-repeat: no-repeat; background-image: url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABIAAAASCAYAAABWzo5XAAAAAXNSR0IArs4c6QAAAAZiS0dEAP8A/wD/oL2nkwAAAAlwSFlzAAALEgAACxIB0t1+/AAAAAd0SU1FB9sLFwMeCjjhcOMAAAD+SURBVDjLtZSvTgNBEIe/WRRnm3U8RC1neQdsm1zSBIU9VVF1FkUguQQsD9ITmD7ECZIJSE4OZo9stoVjC/zc7ky+zH9hXwVwDpTAWWLrgS3QAe8AZgaAJI5zYAmc8r0G4AHYHQKVwII8PZrZFsBFkeRCABYiMh9BRUhnSkPTNCtVXYXURi1FpBDgArj8QU1eVXUzfnjv7yP7kwu1mYrkWlU33vs1QNu2qU8pwN0UpKoqokjWwCztrMuBhEhmh8bD5UDqur75asbcX0BGUB9/HAMB+r32hznJgXy2v0sGLBcyAJ1EK3LFcbo1s91JeLwAbwGYu7TP/3ZGfnXYPgAVNngtqatUNgAAAABJRU5ErkJggg==); } .comments .comments-content .loadmore a { border-top: 1px solid #999999; border-bottom: 1px solid #999999; } .comments .comment-thread.inline-thread { background-color: #eeeeee; } .comments .continue { border-top: 2px solid #999999; } /* Accents ---------------------------------------------- */ .section-columns td.columns-cell { border-left: 1px solid transparent; } .blog-pager { background: transparent url(//www.blogblog.com/1kt/simple/paging_dot.png) repeat-x scroll top center; } .blog-pager-older-link, .home-link, .blog-pager-newer-link { background-color: #ffffff; padding: 5px; } .footer-outer { border-top: 1px dashed #bbbbbb; } /* Mobile ----------------------------------------------- */ body.mobile { background-size: auto; } .mobile .body-fauxcolumn-outer { 
background: transparent none repeat scroll top left; } .mobile .body-fauxcolumn-outer .cap-top { background-size: 100% auto; } .mobile .content-outer { -webkit-box-shadow: 0 0 3px rgba(0, 0, 0, .15); box-shadow: 0 0 3px rgba(0, 0, 0, .15); } .mobile .tabs-inner .widget ul { margin-left: 0; margin-right: 0; } .mobile .post { margin: 0; } .mobile .main-inner .column-center-inner .section { margin: 0; } .mobile .date-header span { padding: 0.1em 10px; margin: 0 -10px; } .mobile h3.post-title { margin: 0; } .mobile .blog-pager { background: transparent none no-repeat scroll top center; } .mobile .footer-outer { border-top: none; } .mobile .main-inner, .mobile .footer-inner { background-color: #ffffff; } .mobile-index-contents { color: #666666; } .mobile-link-button { background-color: #8832ff; } .mobile-link-button a:link, .mobile-link-button a:visited { color: #ffffff; } .mobile .tabs-inner .section:first-child { border-top: none; } .mobile .tabs-inner .PageList .widget-content { background-color: #eeeeee; color: #000000; border-top: 1px solid #dddddd; border-bottom: 1px solid #dddddd; } .mobile .tabs-inner .PageList .widget-content .pagelist-arrow { border-left: 1px solid #dddddd; } --></style> <style id='template-skin-1' type='text/css'><!-- body { min-width: 960px; } .content-outer, .content-fauxcolumn-outer, .region-inner { min-width: 960px; max-width: 960px; _width: 960px; } .main-inner .columns { padding-left: 0px; padding-right: 190px; } .main-inner .fauxcolumn-center-outer { left: 0px; right: 190px; /* IE6 does not respect left and right together */ _width: expression(this.parentNode.offsetWidth - parseInt("0px") - parseInt("190px") + 'px'); } .main-inner .fauxcolumn-left-outer { width: 0px; } .main-inner .fauxcolumn-right-outer { width: 190px; } .main-inner .column-left-outer { width: 0px; right: 100%; margin-left: -0px; } .main-inner .column-right-outer { width: 190px; margin-right: -190px; } #layout { min-width: 0; } #layout .content-outer { min-width: 0; 
width: 800px; } #layout .region-inner { min-width: 0; width: auto; } body#layout div.add_widget { padding: 8px; } body#layout div.add_widget a { margin-left: 32px; } --></style> <link href='https://www.blogger.com/dyn-css/authorization.css?targetBlogID=5299754536670281996&zx=d22989f0-9133-4003-89d9-fe7c5d7e0799' media='none' onload="if(media!='all')media='all'" rel='stylesheet'/><noscript><link href='https://www.blogger.com/dyn-css/authorization.css?targetBlogID=5299754536670281996&zx=d22989f0-9133-4003-89d9-fe7c5d7e0799' rel='stylesheet'/></noscript> <meta name='google-adsense-platform-account' content='ca-host-pub-1556223355139109'/> <meta name='google-adsense-platform-domain' content='blogspot.com'/> </head> <body class='loading variant-simplysimple'> <div class='navbar section' id='navbar' name='Navbar'><div class='widget Navbar' data-version='1' id='Navbar1'><script type="text/javascript"> function setAttributeOnload(object, attribute, val) { if(window.addEventListener) { window.addEventListener('load', function(){ object[attribute] = val; }, false); } else { window.attachEvent('onload', function(){ object[attribute] = val; }); } } </script> <div id="navbar-iframe-container"></div> <script type="text/javascript" src="https://apis.google.com/js/platform.js"></script> <script type="text/javascript"> gapi.load("gapi.iframes:gapi.iframes.style.bubble", function() { if (gapi.iframes && gapi.iframes.getContext) { gapi.iframes.getContext().openChild({ url: 'https://www.blogger.com/navbar/5299754536670281996?origin\x3dhttp://baskauf.blogspot.com', where: document.getElementById("navbar-iframe-container"), id: "navbar-iframe" }); } }); </script><script type="text/javascript"> (function() { var script = document.createElement('script'); script.type = 'text/javascript'; script.src = '//pagead2.googlesyndication.com/pagead/js/google_top_exp.js'; var head = document.getElementsByTagName('head')[0]; if (head) { head.appendChild(script); }})(); </script> </div></div> <div 
class='body-fauxcolumns'> <div class='fauxcolumn-outer body-fauxcolumn-outer'> <div class='cap-top'> <div class='cap-left'></div> <div class='cap-right'></div> </div> <div class='fauxborder-left'> <div class='fauxborder-right'></div> <div class='fauxcolumn-inner'> </div> </div> <div class='cap-bottom'> <div class='cap-left'></div> <div class='cap-right'></div> </div> </div> </div> <div class='content'> <div class='content-fauxcolumns'> <div class='fauxcolumn-outer content-fauxcolumn-outer'> <div class='cap-top'> <div class='cap-left'></div> <div class='cap-right'></div> </div> <div class='fauxborder-left'> <div class='fauxborder-right'></div> <div class='fauxcolumn-inner'> </div> </div> <div class='cap-bottom'> <div class='cap-left'></div> <div class='cap-right'></div> </div> </div> </div> <div class='content-outer'> <div class='content-cap-top cap-top'> <div class='cap-left'></div> <div class='cap-right'></div> </div> <div class='fauxborder-left content-fauxborder-left'> <div class='fauxborder-right content-fauxborder-right'></div> <div class='content-inner'> <header> <div class='header-outer'> <div class='header-cap-top cap-top'> <div class='cap-left'></div> <div class='cap-right'></div> </div> <div class='fauxborder-left header-fauxborder-left'> <div class='fauxborder-right header-fauxborder-right'></div> <div class='region-inner header-inner'> <div class='header section' id='header' name='Header'><div class='widget Header' data-version='1' id='Header1'> <div id='header-inner'> <div class='titlewrapper'> <h1 class='title'> <a href='http://baskauf.blogspot.com/'> Steve Baskauf's blog </a> </h1> </div> <div class='descriptionwrapper'> <p class='description'><span> </span></p> </div> </div> </div></div> </div> </div> <div class='header-cap-bottom cap-bottom'> <div class='cap-left'></div> <div class='cap-right'></div> </div> </div> </header> <div class='tabs-outer'> <div class='tabs-cap-top cap-top'> <div class='cap-left'></div> <div class='cap-right'></div> </div> 
<div class='fauxborder-left tabs-fauxborder-left'> <div class='fauxborder-right tabs-fauxborder-right'></div> <div class='region-inner tabs-inner'> <div class='tabs no-items section' id='crosscol' name='Cross-Column'></div> <div class='tabs no-items section' id='crosscol-overflow' name='Cross-Column 2'></div> </div> </div> <div class='tabs-cap-bottom cap-bottom'> <div class='cap-left'></div> <div class='cap-right'></div> </div> </div> <div class='main-outer'> <div class='main-cap-top cap-top'> <div class='cap-left'></div> <div class='cap-right'></div> </div> <div class='fauxborder-left main-fauxborder-left'> <div class='fauxborder-right main-fauxborder-right'></div> <div class='region-inner main-inner'> <div class='columns fauxcolumns'> <div class='fauxcolumn-outer fauxcolumn-center-outer'> <div class='cap-top'> <div class='cap-left'></div> <div class='cap-right'></div> </div> <div class='fauxborder-left'> <div class='fauxborder-right'></div> <div class='fauxcolumn-inner'> </div> </div> <div class='cap-bottom'> <div class='cap-left'></div> <div class='cap-right'></div> </div> </div> <div class='fauxcolumn-outer fauxcolumn-left-outer'> <div class='cap-top'> <div class='cap-left'></div> <div class='cap-right'></div> </div> <div class='fauxborder-left'> <div class='fauxborder-right'></div> <div class='fauxcolumn-inner'> </div> </div> <div class='cap-bottom'> <div class='cap-left'></div> <div class='cap-right'></div> </div> </div> <div class='fauxcolumn-outer fauxcolumn-right-outer'> <div class='cap-top'> <div class='cap-left'></div> <div class='cap-right'></div> </div> <div class='fauxborder-left'> <div class='fauxborder-right'></div> <div class='fauxcolumn-inner'> </div> </div> <div class='cap-bottom'> <div class='cap-left'></div> <div class='cap-right'></div> </div> </div> <!-- corrects IE6 width calculation --> <div class='columns-inner'> <div class='column-center-outer'> <div class='column-center-inner'> <div class='main section' id='main' name='Main'><div 
class='widget Blog' data-version='1' id='Blog1'> <div class='blog-posts hfeed'> <div class="date-outer"> <h2 class='date-header'><span>Saturday, February 8, 2020</span></h2> <div class="date-posts"> <div class='post-outer'> <div class='post hentry uncustomized-post-template' itemprop='blogPost' itemscope='itemscope' itemtype='http://schema.org/BlogPosting'> <meta content='https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEj4MaVLvnwqh_fDhADRkaW0V17G5sq-qfwvNDrN-vGjxtddQsjGnhsEGl05ieP0CAYuvQN7wF0j8YDo2jVf2eKXpev9WRnc4CirXeoG78RugOxjHwQgAlSKSXdk6cZrTorEK-GHx77k1xU/s640/diagram17.png' itemprop='image_url'/> <meta content='5299754536670281996' itemprop='blogId'/> <meta content='6983588932676854599' itemprop='postId'/> <a name='6983588932676854599'></a> <h3 class='post-title entry-title' itemprop='name'> <a href='http://baskauf.blogspot.com/2020/02/vanderbot-part-4-preparing-data-to-send.html'>VanderBot part 4: Preparing data to send to Wikidata</a> </h3> <div class='post-header'> <div class='post-header-line-1'></div> </div> <div class='post-body entry-content' id='post-body-6983588932676854599' itemprop='description articleBody'> <br /> <div class="separator" style="clear: both; text-align: center;"> <a href="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEj4MaVLvnwqh_fDhADRkaW0V17G5sq-qfwvNDrN-vGjxtddQsjGnhsEGl05ieP0CAYuvQN7wF0j8YDo2jVf2eKXpev9WRnc4CirXeoG78RugOxjHwQgAlSKSXdk6cZrTorEK-GHx77k1xU/s1600/diagram17.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"><img border="0" data-original-height="601" data-original-width="797" height="482" src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEj4MaVLvnwqh_fDhADRkaW0V17G5sq-qfwvNDrN-vGjxtddQsjGnhsEGl05ieP0CAYuvQN7wF0j8YDo2jVf2eKXpev9WRnc4CirXeoG78RugOxjHwQgAlSKSXdk6cZrTorEK-GHx77k1xU/s640/diagram17.png" width="640" /></a></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> In the <a 
href="http://baskauf.blogspot.com/2020/02/vanderbot-part-3-writing-data-from-csv.html" target="_blank">previous blog post</a>, I described how I used a Python script to upload data stored in a CSV spreadsheet to Wikidata via the Wikidata API.<span style="mso-spacerun: yes;"> </span>I noted that the spreadsheet contained information about whether data were already in Wikidata and if they needed to be written to the API, but I did not say how I acquired those data, nor how I determined whether they needed to be uploaded or not. That data acquisition and processing is the topic of this post.</div> <br /> <div class="MsoNormal"> The overall goal of the VanderBot project is to enter data about Vanderbilt employees (scholars and researchers) and their academic publications into Wikidata. Thus far in the project, I have focused primarily on acquiring and uploading data about the employees. The data acquisition process has three stages:<o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> 1. Acquiring the names of research employees (faculty, postdocs, and research staff) in departments of the university.<o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> 2. Determining whether those employees were already present in Wikidata or if items needed to be created for them.<o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> 3. 
Generating data required to make key statements about the employees and determining whether those statements (and associated references) had already been asserted in Wikidata.<o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> The data harvesting script (coded in Python) required to carry out these processes is available via a <a href="https://github.com/HeardLibrary/linked-data/blob/master/publications/process_department.ipynb" target="_blank">Jupyter notebook available on GitHub</a>.<o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="separator" style="clear: both; text-align: center;"> <a href="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjEkcScUlXLcH27rRh2x6noUdII5ZcrUK8CVZWx_Rni0KJWD-XUiFSmObQHJvytQWyx9zM8bd-RW8XzasRVQ87E9aS2OECqhv0HMuI_S-wvScHrjbeZkQ507fAevvNhOnGTE50fBuaw9V4/s1600/diagram18.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"><img border="0" data-original-height="598" data-original-width="780" height="489" src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjEkcScUlXLcH27rRh2x6noUdII5ZcrUK8CVZWx_Rni0KJWD-XUiFSmObQHJvytQWyx9zM8bd-RW8XzasRVQ87E9aS2OECqhv0HMuI_S-wvScHrjbeZkQ507fAevvNhOnGTE50fBuaw9V4/s640/diagram18.png" width="640" /></a></div> <br /> <br /> <h2> Acquire names of research employees at Vanderbilt</h2> <br /> <h4> Scrape departmental website</h4> <div class="MsoNormal"> I've linked employees to Vanderbilt through their departmental affiliations. Therefore, the first task was to create items for departments in the various schools and colleges of Vanderbilt University. 
I won't go into detail about that process other than to say that the hacky code I used to do it is <a href="https://github.com/HeardLibrary/linked-data/tree/master/publications/departments" target="_blank">on GitHub</a>.<o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> The actual names of the employees were acquired by scraping departmental faculty and staff web pages. I developed the scraping script based on the web page of my old department, biological sciences. Fortunately, the same page template was used by many other departments in both the College of Arts and Sciences and the Peabody College of Education, so I was able to scrape about 2/3 of the departments in those schools without modifying the script I developed for the biological sciences department. <o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> Because the departments had differing numbers of researcher pages covering different categories of researchers, I created a JSON configuration file where I recorded the base departmental URLs and the strings appended to that base to generate each of the researcher pages. The configuration file also included some other data needed by the script, such as the department's Wikidata Q ID, a generic description to use for researchers in the department (if they didn’t already have a description), and some strings that I used for fuzzy matching with other records (described later). Some sample JSON is included in the comments near the top of the script.<o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> The result at the end of the "Scrape departmental website" section of the code was a CSV file with the researcher names and some other data that I made a feeble attempt to scrape, such as their title and affiliation. 
<o:p></o:p></div> <br /> <br /> <h4> Search ORCID for Vanderbilt employees</h4> <a href="https://orcid.org/" target="_blank">ORCID</a> (Open Researcher and Contributor ID) plays an important part in disambiguating employees. Because ORCIDs are globally unique, associating an employee name with an ORCID allows one to know that the employee is different from someone with the same name who has a different ORCID.<br /> <div class="MsoNormal"> <o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> For that reason, I began the disambiguation process by performing a search for "Vanderbilt University" using the <a href="https://orcid.org/organizations/integrators/API" target="_blank">ORCID API</a>. The search produced several thousand results. I then dereferenced each of the resulting ORCID URIs to capture the full data about the researcher. That required an API call for each record and I used a quarter second delay per call to avoid hitting the API too fast. As a result, this stage of the process took hours to run.<o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> I screened the results by recording only those that listed "Vanderbilt University" as part of the employments affiliation organization string. That excluded people who were only students and never employees, and included people whose affiliation was "Vanderbilt University Medical Center", "Vanderbilt University School of Nursing", etc. As part of the data recorded, I included their stated departmental affiliations (some had multiple affiliations if they moved from one department to another during their career). After this stage, I had 2240 name/department records.<o:p></o:p></div> <br /> <br /> <h4> Fuzzy matching of departmental and ORCID records</h4> The next stage of the process was to try to match employees from the department that I was processing with the downloaded ORCID records. 
I used a Python fuzzy string matching function called <span style="font-family: "courier new" , "courier" , monospace;">fuzz.token_set_ratio()</span> from the <a href="https://github.com/seatgeek/fuzzywuzzy" target="_blank">fuzzywuzzy</a> package. I tested this function along with others in the package and it was highly effective at matching names with minor variations (both people and departmental names). Because this function was insensitive to word order, it matched names like "Department of Microbiology" and "Microbiology Department". However, it also made major errors for name order reversals ("John James" and "James Johns", for example) so I had an extra check for that.<br /> <div class="MsoNormal"> <o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> If the person's name had a match score of greater than 90 (out of 100), I then performed a match check against the listed department. If it also had a match score of greater than 90, I assigned that ORCID to the person. 
If no listed department matched had a score over 90, I assigned the ORCID, but flagged that match for manual checking later.<o:p></o:p></div> <br /> <br /> <div class="separator" style="clear: both; text-align: center;"> <a href="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjIUr2T9o7_LbDhEU6f24xW3RcOi2c6bGMfG9L0erOD9i5pb1Aanj0-JQlnpkJxLbSSorvfe1qPtBp7wSyXZh3LW6NJ_KyMXyR-veYV53-SHDBUdd2cgvHy2BMtF0i71-sLQdUoJ6gvTWM/s1600/diagram19.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"><img border="0" data-original-height="596" data-original-width="779" height="488" src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjIUr2T9o7_LbDhEU6f24xW3RcOi2c6bGMfG9L0erOD9i5pb1Aanj0-JQlnpkJxLbSSorvfe1qPtBp7wSyXZh3LW6NJ_KyMXyR-veYV53-SHDBUdd2cgvHy2BMtF0i71-sLQdUoJ6gvTWM/s640/diagram19.png" width="640" /></a></div> <br /> <h2> Determine whether employees were already in Wikidata</h2> <br /> <h4> Attempt automated matching with people in Wikidata known to work at Vanderbilt</h4> <div class="MsoNormal"> I was then ready to start trying to match people with existing Wikidata records. The low-hanging fruit was people whose records already stated that their employer was Vanderbilt University (Q29052). I ran a SPARQL query for that using the Wikidata Query Service. For each match, I also recorded the employee's description, ORCID, start date, and end date (where available). <o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> Once I had those data, I checked each departmental employee's record against the query results. If both the departmental employee and the potential match from Wikidata had the same ORCID, then I knew that they were the same person and I assigned the Wikidata Q ID to that employee. 
If the employee had an ORCID I could exclude any Wikidata records with non-matching ORCIDs and only check for name matches with Wikidata records that didn't have ORCIDs.<span style="mso-spacerun: yes;"> </span>Getting a name match alone was not a guarantee that the person in Wikidata was the same as the departmental employee, but given that the pool of possible Wikidata matches only included people employed at Vanderbilt, a good name match meant that it was probably the same person. If the person had a description in Wikidata, I printed the two names and the description and visually inspected the matches. For example, if there was a member of the Biological Sciences department named Jacob Reynolds and someone in Wikidata named Jacob C. Reynolds who was a microbiologist, the match was probably good. On the other hand, if Jacob C. Reynolds was a historian, then some manual checking was in order.<span style="mso-spacerun: yes;"> I did</span> a few other tricks that you can see in the code.<o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> This "smart matching" with minimal human intervention was usually able to match a small fraction of people in the department. But there were plenty of departmental employees who were already in Wikidata without any indication that they worked at Vanderbilt. The obvious way to look for them would be to just do a SPARQL query for their name. There are some features built in to SPARQL that allow for REGEX checks, but those features are impossibly slow for a triplestore the size of Wikidata's. The strategy that I settled for was to generate as many possible variations of the person's name and query for all of them at once. You can see what I did in the <span style="font-family: "courier new" , "courier" , monospace;">generateNameAlternatives()</span> function in the code. 
I searched labels and aliases for: the full name, names with middle initials with and without periods, first and middle initials with and without periods, etc. This approach was pretty good at matching with the right people, but it also matched with a lot of wrong people. For example, for Jacob C. Reynolds, I would also search for J. C. Reynolds. If John C. Reynolds had J. C. Reynolds as an alias, he would come up as a hit. I could have tried to automate the processing of the returned names more, but there usually weren't a lot of matches and with the other screening criteria I applied, it was pretty easy for me to just look at the results and bypass the false positives.</div> <div class="MsoNormal"> <o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> When I did the query for the name alternatives, I downloaded the values for several properties that were useful for eliminating hits. One important screen was to eliminate any matching items that were instances of classes (P31) other than human (Q5). I also screened out people who were listed as having died prior to some set date (2000 worked well - some departments still listed recently deceased emeriti and I didn't want to eliminate those).<span style="mso-spacerun: yes;"> </span>If both the employee and the name match in Wikidata had ORCIDs that were different, I also eliminated the hit.<span style="mso-spacerun: yes;"> </span>For all matches that passed these screens, I printed the description, occupation, and employer if they were given in Wikidata. <o:p></o:p></div> <br /> <br /> <h4> Clues from publications in PubMed and Crossref</h4> The other powerful tool I used for disambiguation was to look up any articles linked to the putative Wikidata match.<span style="mso-spacerun: yes;"> </span>For each Wikidata person item who made it this far through the screen, I did a SPARQL query to find works authored by that person. 
For up to 10 works, I did the following.<span style="mso-spacerun: yes;"> </span>If the article had a PubMed ID, I retrieved the article metadata from the <a href="https://www.ncbi.nlm.nih.gov/books/NBK25501/" target="_blank">PubMed API</a> and tried to match against the author names. When I got a match with an author, I checked for an ORCID match (or excluded if an ORCID mismatch) and also for a fuzzy match against any affiliation that was given.<span style="mso-spacerun: yes;"> </span>If either an ORCID or affiliation matched, I concluded that the departmental employee was the same as the Wikidata match and stopped looking.<br /> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> If there was no match in PubMed and the article had a DOI, I then retrieved the metadata about the article from the <a href="https://www.crossref.org/services/metadata-delivery/rest-api/" target="_blank">CrossRef API</a> and did the same kind of screening that I did in PubMed.<span style="mso-spacerun: yes;"> </span><o:p></o:p></div> <br /> <br /> <h4> Human intervention</h4> If there was no automatic match via the article searches, I printed out the full set of information (description, employer, articles, etc.) for every name match, along with the name from the department and the name from Wikidata in order for a human to check whether any of the matches seemed plausible. In a lot of cases, it was easy to eliminate matches that had descriptions like "Ming Dynasty person" or occupation = "golfer". 
If there was uncertainty, the script printed hyperlinked Wikidata URLs and I could just click on them to examine the Wikidata record manually.<br /> <br /> Here's some typical output:<br /> <br /> <span style="font-family: "courier new" , "courier" , monospace;">--------------------------</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">No Wikidata name match: Justine Bruyère</span><br /> <span style="font-family: "courier new" , "courier" , monospace;"><br /></span> <span style="font-family: "courier new" , "courier" , monospace;">--------------------------</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">No Wikidata name match: Nicole Chaput Guizani</span><br /> <span style="font-family: "courier new" , "courier" , monospace;"><br /></span> <span style="font-family: "courier new" , "courier" , monospace;">--------------------------</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">SPARQL name search: Caroline Christopher</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">(no ORCID)</span><br /> <span style="font-family: "courier new" , "courier" , monospace;"><br /></span> <span style="font-family: "courier new" , "courier" , monospace;"><br /></span> <span style="font-family: "courier new" , "courier" , monospace;">0 Wikidata ID: Q83552019 Name variant: Caroline Christopher <a href="https://www.wikidata.org/wiki/Q83552019" target="_blank">https://www.wikidata.org/wiki/Q83552019</a></span><br /> <span style="font-family: "courier new" , "courier" , monospace;">No death date given.</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">description: human and organizational development educator</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">employer: Vanderbilt University</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">No articles authored by that person</span><br 
/> <span style="font-family: "courier new" , "courier" , monospace;">Employee: Caroline Christopher vs. name variant: Caroline Christopher</span><br /> <span style="font-family: "courier new" , "courier" , monospace;"><br /></span> <span style="font-family: "courier new" , "courier" , monospace;">Enter the number of the matched entity, or press Enter/return if none match: 0</span><br /> <span style="font-family: "courier new" , "courier" , monospace;"><br /></span> <span style="font-family: "courier new" , "courier" , monospace;">--------------------------</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">SPARQL name search: Paul Cobb</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">(no ORCID)</span><br /> <span style="font-family: "courier new" , "courier" , monospace;"><br /></span> <span style="font-family: "courier new" , "courier" , monospace;"><br /></span> <span style="font-family: "courier new" , "courier" , monospace;">0 Wikidata ID: Q28936750 Name variant: Paul Cobb <a href="https://www.wikidata.org/wiki/Q28936750" target="_blank">https://www.wikidata.org/wiki/Q28936750</a></span><br /> <span style="font-family: "courier new" , "courier" , monospace;">No death date given.</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">description: association football player</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">occupation: association football player</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">No articles authored by that person</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">Employee: Paul Cobb vs. 
name variant: Paul Cobb</span><br /> <span style="font-family: "courier new" , "courier" , monospace;"><br /></span> <span style="font-family: "courier new" , "courier" , monospace;"><br /></span> <span style="font-family: "courier new" , "courier" , monospace;">1 Wikidata ID: Q55746009 Name variant: Paul Cobb <a href="https://www.wikidata.org/wiki/Q55746009" target="_blank">https://www.wikidata.org/wiki/Q55746009</a></span><br /> <span style="font-family: "courier new" , "courier" , monospace;">No death date given.</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">description: American newspaper publisher</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">occupation: newspaper proprietor</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">No articles authored by that person</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">Employee: Paul Cobb vs. name variant: Paul Cobb</span><br /> <span style="font-family: "courier new" , "courier" , monospace;"><br /></span> <span style="font-family: "courier new" , "courier" , monospace;">Enter the number of the matched entity, or press Enter/return if none match: </span><br /> <span style="font-family: "courier new" , "courier" , monospace;"><br /></span> <span style="font-family: "courier new" , "courier" , monospace;">--------------------------</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">No Wikidata name match: Molly Collins</span><br /> <span style="font-family: "courier new" , "courier" , monospace;"><br /></span> <span style="font-family: "courier new" , "courier" , monospace;">--------------------------</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">No Wikidata name match: Ana Christina da Silva [Iddings]</span><br /> <br /> <br /> Although this step did require human intervention, because of the large amount of information that the script 
collected about the Wikidata matches, it usually only took a few minutes to disambiguate a department with 30 to 50 employees.<br /> <div> <br /></div> <div class="separator" style="clear: both; text-align: center;"> <a href="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjPsVESAarTbzNBolXgPC-MtXplJMIIvNvJvwvykHBL9eoLniv5Ln3lLQC421PnPGhInYvHnnHAjGLQjEdKJuhUVIk8c8xmXUMB3ORCPxvKU-IGxpok50NFr6tD1nUKG1lIV68EIkwMpeM/s1600/diagram20.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"><img border="0" data-original-height="597" data-original-width="786" height="486" src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjPsVESAarTbzNBolXgPC-MtXplJMIIvNvJvwvykHBL9eoLniv5Ln3lLQC421PnPGhInYvHnnHAjGLQjEdKJuhUVIk8c8xmXUMB3ORCPxvKU-IGxpok50NFr6tD1nUKG1lIV68EIkwMpeM/s640/diagram20.png" width="640" /></a></div> <div> <br /></div> <h2> Generate statements and references and determine which were already in Wikidata</h2> <div> <br /></div> <h4> Generating data for a minimal set of properties</h4> <div> <div class="MsoNormal"> The next to last step was to assign values to a minimal set of properties that I felt each employee should have in a Wikidata record. Here's what I settled on for that minimal set:<o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> <span style="font-family: "courier new" , "courier" , monospace;">P31 Q5 </span>(<i>instance of human</i>). This was automatically assigned to all records.<o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> <span style="font-family: "courier new" , "courier" , monospace;">P108 Q29052</span> (<i>employer Vanderbilt University</i>). 
This applies to all employees in our project - the employer value can be set at the top of the script.<o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> <span style="font-family: "courier new" , "courier" , monospace;">P1416</span> <span style="font-family: "courier new" , "courier" , monospace;">[Q ID of department]</span> (<i>affiliation with focal department</i>). After searching through many possible properties, I decided that <span style="font-family: "courier new" , "courier" , monospace;">P1416 </span>(<i>affiliation</i>) was the best property to use to assert the employee's connection to the department I was processing. <span style="font-family: "courier new" , "courier" , monospace;">P108 </span>was also possible, but there were a lot of people with dual departmental appointments and I generally didn't know which department was the actual "employer". Affiliation seemed to be an appropriate connection for regular faculty, postdocs, visiting faculty, research staff, and other kinds of statuses where the person would have some kind of research or scholarly output. <o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> <span style="font-family: "courier new" , "courier" , monospace;">P496 [ORCID identifier]</span>. ORCIDs that I'd acquired for the employees were hard-won and an excellent means for anyone else to carry out disambiguation, so I definitely wanted to include that assertion if I could. <o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> <span style="font-family: "courier new" , "courier" , monospace;">P21 [sex or gender]</span>. I was really uncomfortable assigning a value of this property, but this is a property often flagged by <a href="https://www.wikidata.org/wiki/Wikidata:Recoin" target="_blank">Recoin</a> as a top missing property and I didn't want some overzealous editor deleting my new items because their metadata were too skimpy. 
Generally, the departmental web pages had photos to go with the names, so I made a call and manually assigned a value for this property (options: m=male, f=female, i=intersex, tf=transgender female, tm=transgender male). Any time the sex or gender seemed uncertain, I did not provide a value.<o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> <i>The description</i>.<span style="mso-spacerun: yes;"> </span>I made up a default description for the department, such as "biological science researcher", "historian", or "American Studies scholar" for the Biological Sciences, History, and American Studies departments respectively. I did not overwrite any existing descriptions by default, although as a last step I looked at the table to replace stupid ones like "researcher, ORCID: 0000-0002-1234-5678". These defaults were generally specific enough to prevent collisions where the label/description combination I was creating would collide with the label/description combination for an existing record and kill the record write. <o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> When it made sense, I added references to the statements I was making. Generally, a reference is not expected for <i>instance of human</i> and I really couldn't give a reference for <i>sex or gender</i>.<span style="mso-spacerun: yes;"> </span>For the <i>employer </i>and <i>affiliation </i>references, I used the web page that I scraped to get their name as the <i>reference URL</i> and provided the current date as the value for <span style="font-family: "courier new" , "courier" , monospace;">P813 </span>(<i>retrieved</i>).<span style="mso-spacerun: yes;"> </span>For ORCID, I created a reference that had a <span style="font-family: "courier new" , "courier" , monospace;">P813 </span>(<i>retrieved</i>) property if I was able to successfully dereference the ORCID URI. 
<o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> Because each of these properties had different criteria for assigning values and references, there was no standard code for assigning them. The code for each property is annotated, so if you are interested you can look at it to see how I made the assignments.<o:p></o:p></div> <div class="MsoNormal"> <br /></div> </div> <div class="separator" style="clear: both; text-align: center;"> <a href="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEg_xaBGWKHzOc5s-xnoSV9tBzXG6NeLT2nSuEfcKaweUBwx00MiZcucXnky3eEqvtOjfzzSX95bOV9tpEaW8lNxjIPrwKqtUOIwSaiTLIg1Fhw7Hbyl-osHe7iLWVu9Ry_BijnysBzuBAg/s1600/diagram21.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"><img border="0" data-original-height="528" data-original-width="975" height="346" src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEg_xaBGWKHzOc5s-xnoSV9tBzXG6NeLT2nSuEfcKaweUBwx00MiZcucXnky3eEqvtOjfzzSX95bOV9tpEaW8lNxjIPrwKqtUOIwSaiTLIg1Fhw7Hbyl-osHe7iLWVu9Ry_BijnysBzuBAg/s640/diagram21.png" width="640" /></a></div> <div> <br /></div> <h4> Check for existing data in Wikidata</h4> <div> <div class="MsoNormal"> In the earlier posts, I said that I did not want VanderBot to create duplicate items, statements, and references when they already existed in Wikidata. So a critical last step was to check for existing data using SPARQL. One important thing to keep in mind is the Query Service Updater lag that I talked about in the last post. That lag means that changes made up to 8 or 10 hours ago would not be included in this download. However, given that the Wikidata researcher item records I'm dealing with do not change frequently, the lag generally wasn't a problem. 
I should note that it would be possible to get these data directly from the Wikidata API, but the convenience of getting exactly the information I wanted using SPARQL outweighed my motivation to develop code to do that.<o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> At this point in the workflow, I've already determined with a fairly high degree of confidence which of the departmental employees were already in Wikidata. That takes care of the potential problem of creating duplicate item records, and it also means that I do not need to check for the presence of statements or references for any of the new items either.<o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> One interesting feature of SPARQL that I learned from this project was using the <span style="font-family: "courier new" , "courier" , monospace;">VALUES </span>clause. Despite having used SPARQL for years and skimming through the SPARQL specification several times, I missed it. The <span style="font-family: "courier new" , "courier" , monospace;">VALUES </span>clause allows you to specify which values the query should use for a particular variable in its pattern matching.<span style="mso-spacerun: yes;"> </span>That makes querying a large triplestore like Wikidata much faster than without it and it also reduces the number of results that the code has to sort through when results come back from the query service. 
Here's an example of a query using the <span style="font-family: "courier new" , "courier" , monospace;">VALUES </span>clause that you can test at the <a href="https://query.wikidata.org/" target="_blank">Wikidata Query Service</a>:<br /> <o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> <span style="font-family: "courier new" , "courier" , monospace;">SELECT DISTINCT ?id ?statement WHERE {<o:p></o:p></span></div> <div class="MsoNormal"> <span style="font-family: "courier new" , "courier" , monospace;">VALUES ?id {<o:p></o:p></span></div> <div class="MsoNormal"> <span style="font-family: "courier new" , "courier" , monospace;"><span style="mso-spacerun: yes;"> </span>wd:Q4958<o:p></o:p></span></div> <div class="MsoNormal"> <span style="font-family: "courier new" , "courier" , monospace;"><span style="mso-spacerun: yes;"> </span>wd:Q39993<o:p></o:p></span></div> <div class="MsoNormal"> <span style="font-family: "courier new" , "courier" , monospace;"><span style="mso-spacerun: yes;"> </span>wd:Q234<o:p></o:p></span></div> <div class="MsoNormal"> <span style="font-family: "courier new" , "courier" , monospace;"><span style="mso-spacerun: yes;"> </span>}<o:p></o:p></span></div> <div class="MsoNormal"> <span style="font-family: "courier new" , "courier" , monospace;">?id p:P31 ?statement.<o:p></o:p></span></div> <div class="MsoNormal"> <span style="font-family: "courier new" , "courier" , monospace;">}</span><o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="separator" style="clear: both; text-align: center;"> <br /></div> <div class="MsoNormal"> So the first part of the last step in the workflow is to generate a list of all of the existing item Q IDs for employees in the department. That list is passed to the <span style="font-family: "courier new" , "courier" , monospace;">searchStatementAtWikidata()</span> function as its first argument. 
<span style="font-family: "courier new" , "courier" , monospace;">searchStatementAtWikidata()</span> is a general purpose function that will search Wikidata for a particular property of items in the generated list. It can be used either to search for a particular property and value (like <span style="font-family: "courier new" , "courier" , monospace;">P108 Q29052</span>, <i>employer Vanderbilt University</i>) and retrieve the references for that statement, or for only the property (like <span style="font-family: "courier new" , "courier" , monospace;">P496</span>, <i>ORCID</i>) and retrieve both the values and references associated with those statements.<span style="mso-spacerun: yes;"> </span>This behavior is controlled by whether an empty string is sent for the value argument or not.<span style="mso-spacerun: yes;"> </span>For each of the minimal set of properties that I'm tracking for departmental employees, the <span style="font-family: "courier new" , "courier" , monospace;">searchStatementAtWikidata()</span> is used to retrieve any available data for the listed employees. Those data are then matched with the appropriate employee records and recorded in the CSV file along with the previously generated property values. <o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> In addition to the property checks, labels, descriptions, and aliases for the list of employees are retrieved via SPARQL queries. In the cases of labels and descriptions, if there is an existing label or description in Wikidata, it is written to the CSV file. If there is no existing label, the name scraped from the departmental website is written to the CSV as the label. If there is no existing description, the default description for the department is written to the CSV. 
Whatever alias lists are retrieved from Wikidata (including empty ones) are written to the CSV.<o:p></o:p><br /> <br /></div> </div> <h4> Final manual curation prior to writing to the Wikidata API</h4> <div> <div class="MsoNormal"> In theory, the CSV file resulting from the previous step should contain all of the information needed by the API-writing script that was discussed in the last post. However, I always manually examine the CSV to look for problems or things that are stupid such as bad descriptions. <o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> If a description or label is changed, the API-writing script will detect that it's different from the current value being provided by the SPARQL endpoint and the new description or label will overwrite the existing one. The API-writing script is currently not very sophisticated about how it handles aliases. If there are more aliases in the CSV than are currently in Wikidata, the script will overwrite existing aliases in Wikidata with those in the spreadsheet. The assumption is that alias lists are only added to, rather than aliases being changed or deleted.<span style="mso-spacerun: yes;"> </span>At some point in the future, I intend to write a separate script that will handle labels and aliases in a more robust way, so I really didn't want to waste time now on making the alias-handling better than it is. </div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> A typical situation is to discover a more specific label for the person than already exists in Wikidata. In that case, I usually add the existing label to the alias list, and replace the label value in the CSV with the better new one. <b>WARNING!</b> If you edit the alias list, make sure that your editor uses generic quotes (ASCII <span style="font-family: "courier new" , "courier" , monospace;">34</span>/Unicode <span style="font-family: "courier new" , "courier" , monospace;">U+0022</span>) and not "smart quotes". 
They have a different Unicode value and will break the script. <a href="https://www.openoffice.org/" target="_blank">Open Office</a>/<a href="https://www.libreoffice.org/" target="_blank">Libre Office</a> (the best applications for editing CSVs in my opinion) default to smart quotes, so this setting must be turned off manually.<o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> I also just look over the rest of the spreadsheet to convince myself that nothing weird is going on. Usually the script does an effective job of downloading the correct reference properties and values, but I've discovered some odd situations that have caused problems. <o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> <a href="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhJWOeuqH6OVL-NGjf9BqV9_LOK7UoKWSU9oV_640OnwkHtTFFqilJRWFmLek6sD1wcVHkjCwvBBH3kKtK91fIuM_6AIlgBKt-P7vskQxkfLdNd-BGRYMbm6q6Hv5KG7-6UU4wkCCrbasc/s1600/diagram2.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em; text-align: center;"><img border="0" data-original-height="534" data-original-width="975" height="350" src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhJWOeuqH6OVL-NGjf9BqV9_LOK7UoKWSU9oV_640OnwkHtTFFqilJRWFmLek6sD1wcVHkjCwvBBH3kKtK91fIuM_6AIlgBKt-P7vskQxkfLdNd-BGRYMbm6q6Hv5KG7-6UU4wkCCrbasc/s640/diagram2.png" width="640" /></a></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> At this point, steps 1 and 2 in the VanderBot diagram have been completed by the data harvesting script, and the API-writing script described in the last post is ready to take over in step 3.<span style="mso-spacerun: yes;"> </span>When step 4 is complete, the blank cells in the CSV for missing item, statement, and reference identifiers should all be filled in and the CSV can be filed for future reference. 
<o:p></o:p><br /> <br /></div> </div> <h2> Final thoughts</h2> <div> <div class="MsoNormal"> <br /> I tried to make the API writing script generic and adaptable for writing statements and references about any kind of entity. That's achievable simply by editing the JSON schema file that maps the columns in the source CSV. However, getting the values for that CSV is the tricky part. If one were confident that only new items were being written, then the table could be filled with only the data to be written and without any item, statement, or reference identifiers.<span style="mso-spacerun: yes;"> </span>That would be the case if you were using the script to load your own Wikibase instance. However, for adding data to Wikidata about most items like people or references, one can't know if the data needs to be written or not, and that's why a complex and somewhat idiosyncratic script like the data harvesting script is necessary. So there's no "magic bullet" that will make it possible to automatically know whether you can write data to Wikidata without creating duplicate assertions.<o:p></o:p></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> To find records that VanderBot has put into Wikidata, try this query at the <a href="https://query.wikidata.org/" target="_blank">Wikidata Query Service</a>:</div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> <span style="font-family: "courier new" , "courier" , monospace;">select distinct ?employee where {</span></div> <div class="MsoNormal"> <span style="font-family: "courier new" , "courier" , monospace;"> ?employee wdt:P1416/wdt:P749+ wd:Q29052.</span></div> <div class="MsoNormal"> <span style="font-family: "courier new" , "courier" , monospace;"> }</span></div> <div class="MsoNormal"> <span style="font-family: "courier new" , "courier" , monospace;">limit 50</span></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> The triple pattern requires that the employee first have an 
<i>affiliation</i> (<span style="font-family: "courier new" , "courier" , monospace;">P1416</span>) to some item, and that item be linked by one or more <i>parent organization</i> (<span style="font-family: "courier new" , "courier" , monospace;">P749</span>) links to Vanderbilt University (<span style="font-family: "courier new" , "courier" , monospace;">Q29052</span>). I linked the department items to their parent school or college using <span style="font-family: "courier new" , "courier" , monospace;">P749 </span>and made sure that the University's schools and colleges were all linked to the University by <span style="font-family: "courier new" , "courier" , monospace;">P749 </span>as well. However, some schools like the Blair School of Music do not really have departments, so their employees were affiliated directly to the school or college rather than a department. So the search has to pick up administrative entity items that were either one or two <span style="font-family: "courier new" , "courier" , monospace;">P749 </span>links from the university (hence the "+" property path operator after <span style="font-family: "courier new" , "courier" , monospace;">P749</span>). Since there are a lot of employees, I limited the results to 50. If you click on any of the results, it will take you to the item page and you can view the page history to confirm that VanderBot had made edits to the page. 
(At some point, there may be people who were linked in this way by an account other than VanderBot, but thus far, VanderBot is probably the only editor of Vanderbilt employees items that's linking to departments by <span style="font-family: Courier New, Courier, monospace;">P1416</span>, given that I recently created all of the department items from scratch.)</div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> A variation of that query will tell you the number of records meeting the criteria of the previous query:</div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> <span style="font-family: "courier new" , "courier" , monospace;">select (count(?employee) as ?count) where {</span></div> <div class="MsoNormal"> <span style="font-family: "courier new" , "courier" , monospace;"> ?employee wdt:P1416/wdt:P749+ wd:Q29052.</span></div> <div class="MsoNormal"> <span style="font-family: "courier new" , "courier" , monospace;"> }</span></div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> As of 2020-02-08, there are 1221 results. 
That number should grow as I use VanderBot to process other departments.</div> <div class="MsoNormal"> <br /></div> <div class="MsoNormal"> <br /></div> </div> <div style='clear: both;'></div> </div> <div class='post-footer'> <div class='post-footer-line post-footer-line-1'> <span class='post-author vcard'> Posted by <span class='fn' itemprop='author' itemscope='itemscope' itemtype='http://schema.org/Person'> <meta content='https://www.blogger.com/profile/01896499749604153763' itemprop='url'/> <a class='g-profile' href='https://www.blogger.com/profile/01896499749604153763' rel='author' title='author profile'> <span itemprop='name'>Steve Baskauf</span> </a> </span> </span> <span class='post-timestamp'> at <meta content='http://baskauf.blogspot.com/2020/02/vanderbot-part-4-preparing-data-to-send.html' itemprop='url'/> <a class='timestamp-link' href='http://baskauf.blogspot.com/2020/02/vanderbot-part-4-preparing-data-to-send.html' rel='bookmark' title='permanent link'><abbr class='published' itemprop='datePublished' title='2020-02-08T07:28:00-08:00'>7:28 AM</abbr></a> </span> <span class='post-comment-link'> <a class='comment-link' href='http://baskauf.blogspot.com/2020/02/vanderbot-part-4-preparing-data-to-send.html#comment-form' onclick=''> No comments: </a> </span> <span class='post-icons'> <span class='item-control blog-admin pid-95103704'> <a href='https://www.blogger.com/post-edit.g?blogID=5299754536670281996&postID=6983588932676854599&from=pencil' title='Edit Post'> <img alt='' class='icon-action' height='18' src='https://resources.blogblog.com/img/icon18_edit_allbkg.gif' width='18'/> </a> </span> </span> <div class='post-share-buttons goog-inline-block'> <a class='goog-inline-block share-button sb-email' href='https://www.blogger.com/share-post.g?blogID=5299754536670281996&postID=6983588932676854599&target=email' target='_blank' title='Email This'><span class='share-button-link-text'>Email This</span></a><a class='goog-inline-block share-button sb-blog' 
href='https://www.blogger.com/share-post.g?blogID=5299754536670281996&postID=6983588932676854599&target=blog' onclick='window.open(this.href, "_blank", "height=270,width=475"); return false;' target='_blank' title='BlogThis!'><span class='share-button-link-text'>BlogThis!</span></a><a class='goog-inline-block share-button sb-twitter' href='https://www.blogger.com/share-post.g?blogID=5299754536670281996&postID=6983588932676854599&target=twitter' target='_blank' title='Share to X'><span class='share-button-link-text'>Share to X</span></a><a class='goog-inline-block share-button sb-facebook' href='https://www.blogger.com/share-post.g?blogID=5299754536670281996&postID=6983588932676854599&target=facebook' onclick='window.open(this.href, "_blank", "height=430,width=640"); return false;' target='_blank' title='Share to Facebook'><span class='share-button-link-text'>Share to Facebook</span></a><a class='goog-inline-block share-button sb-pinterest' href='https://www.blogger.com/share-post.g?blogID=5299754536670281996&postID=6983588932676854599&target=pinterest' target='_blank' title='Share to Pinterest'><span class='share-button-link-text'>Share to Pinterest</span></a> </div> </div> <div class='post-footer-line post-footer-line-2'> <span class='post-labels'> </span> </div> <div class='post-footer-line post-footer-line-3'> <span class='post-location'> </span> </div> </div> </div> </div> </div></div> <div class="date-outer"> <h2 class='date-header'><span>Friday, February 7, 2020</span></h2> <div class="date-posts"> <div class='post-outer'> <div class='post hentry uncustomized-post-template' itemprop='blogPost' itemscope='itemscope' itemtype='http://schema.org/BlogPosting'> <meta content='https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiDq5PGdWsB-Kh8NxHHT0OxQlCJWCoIdpXcrtf8JasiDWMEvufhX3JDpBYFNJizS3dZ47_9-Yrc0wOHV83vKQ2ueHRnkTh08AUoQwgQuGDLEIxJUAFf_apqmQ6VAJH16rz1iWCik1DAqLg/s640/diagram12.png' itemprop='image_url'/> <meta content='5299754536670281996' 
itemprop='blogId'/> <meta content='1362477816070901447' itemprop='postId'/> <a name='1362477816070901447'></a> <h3 class='post-title entry-title' itemprop='name'> <a href='http://baskauf.blogspot.com/2020/02/vanderbot-part-3-writing-data-from-csv.html'>VanderBot part 3: Writing data from a CSV file to Wikidata</a> </h3> <div class='post-header'> <div class='post-header-line-1'></div> </div> <div class='post-body entry-content' id='post-body-1362477816070901447' itemprop='description articleBody'> <div class="separator" style="clear: both; text-align: center;"> <a href="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiDq5PGdWsB-Kh8NxHHT0OxQlCJWCoIdpXcrtf8JasiDWMEvufhX3JDpBYFNJizS3dZ47_9-Yrc0wOHV83vKQ2ueHRnkTh08AUoQwgQuGDLEIxJUAFf_apqmQ6VAJH16rz1iWCik1DAqLg/s1600/diagram12.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"><img border="0" data-original-height="533" data-original-width="974" height="350" src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiDq5PGdWsB-Kh8NxHHT0OxQlCJWCoIdpXcrtf8JasiDWMEvufhX3JDpBYFNJizS3dZ47_9-Yrc0wOHV83vKQ2ueHRnkTh08AUoQwgQuGDLEIxJUAFf_apqmQ6VAJH16rz1iWCik1DAqLg/s640/diagram12.png" width="640" /></a></div> <div class="separator" style="clear: both; text-align: center;"> <br /></div> <div class="separator" style="clear: both; text-align: left;"> In <a href="http://baskauf.blogspot.com/2020/02/vanderbot-part-2-wikibase-data-model.html" target="_blank">the previous post of this series</a>, I described how my investigation of the Wikibase data model led me to settle on a relatively simple spreadsheet layout for tracking what items, statements, and references needed to be created or edited in Wikidata. Since column headers in a CSV spreadsheet don't really have any meaning other than to a human, it's necessary to map columns to features of the Wikibase model so that a script would know how to write the data in those columns to appropriate data items in Wikidata. 
</div> <br /> <h2> Developing a schema to map spreadsheet columns to the Wikibase model</h2> In <a href="http://baskauf.blogspot.com/2016/10/guid-o-matic-goes-to-china.html" target="_blank">a blog post from 2016</a>, I wrote about a similar problem that I faced when creating an application that would translate tabular CSV data to RDF triples. In that case, I created a mapping CSV table that mapped table headers to particular RDF predicates, and that also indicated the kind of object represented in the table (language-tagged literal, IRI, etc.). That approach worked fine and had the advantage of simplicity, but it had the disadvantage that it was an entirely ad hoc solution that I made up for my own use.<br /> <br /> When I learned about the <a href="https://www.w3.org/TR/csv2rdf/" target="_blank">"Generating RDF from Tabular Data on the Web" W3C Recommendation</a>, I recognized that this was a more standardized way to accomplish a mapping from a CSV table to RDF. When I started working on the VanderBot project I realized that since the Wikibase model can be expressed as an RDF graph, I could construct a schema using this W3C standard to document how my CSV data should be mapped to Wikidata items, properties, references, labels, etc. The most relevant part of the standard is <a href="https://www.w3.org/TR/csv2rdf/#example-events-listing" target="_blank">section 7.3, "Example with single table and using virtual columns to produce multiple subjects per row"</a>.<br /> <br /> An example schema that maps the <a href="https://github.com/HeardLibrary/linked-data/blob/master/publications/departments/engineering-to-write.csv" target="_blank">sample table from the last post</a> is <a href="https://github.com/HeardLibrary/linked-data/blob/master/publications/departments/csv-metadata.json" target="_blank">here</a>. 
The schema is written in JSON and if ingested by an application that can transform CSV files in accordance with the W3C specification, it should produce RDF triples identical to triples about the subject items that are stored in the Wikidata Query Service triplestore (not all triples, but many of the ones that would be generated if the CSV data were loaded into the Wikidata API). I haven't actually tried this since I haven't acquired such an application, but the point is that the JSON schema applied to the CSV data will generate part of the graph that will eventually be present in Wikidata when the data are loaded.<br /> <br /> I will not go into every detail of the example schema, but show several examples of how parts of it map particular columns.<br /> <div> <br /></div> <h4> Column for the item identifier</h4> Each column in the table has a corresponding JSON object in the schema. The first column, with the column header title "wikidataId" is mapped with:<br /> <br /> <span style="font-family: "courier new" , "courier" , monospace;">{</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">"titles": "wikidataId",</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">"name": "wikidataId",</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">"datatype": "string", </span><br /> <span style="font-family: "courier new" , "courier" , monospace;">"suppressOutput": true</span><br /> <span style="font-family: "courier new" , "courier" , monospace;">}</span><br /> <br /> This JSON simply associates a variable name (<span style="font-family: "courier new" , "courier" , monospace;">wikidataId</span>) with the Wikidata Q ID for the item that's the subject of each row. (For simplicity, I've chosen to make the variable names the same as the column titles, but that isn't required.) 
The "true" value for <span style="font-family: "courier new" , "courier" , monospace;">suppressOutput</span> means that no statement is directly generated from this column.<br /> <div> <br /></div> <h4> Column for the label</h4> <div> The "labelEn" column is mapped with this JSON object:</div> <div> <div> <br /></div> <div> <span style="font-family: "courier new" , "courier" , monospace;">{</span></div> <div> <span style="font-family: "courier new" , "courier" , monospace;">"titles": "labelEn",</span></div> <div> <span style="font-family: "courier new" , "courier" , monospace;">"name": "labelEn",</span></div> <div> <span style="font-family: "courier new" , "courier" , monospace;">"datatype": "string",</span></div> <div> <span style="font-family: "courier new" , "courier" , monospace;">"aboutUrl": "http://www.wikidata.org/entity/{wikidataId}",</span></div> <div> <span style="font-family: "courier new" , "courier" , monospace;">"propertyUrl": "rdfs:label",</span></div> <div> <span style="font-family: "courier new" , "courier" , monospace;">"lang": "en"</span></div> <div> <span style="font-family: "courier new" , "courier" , monospace;">}</span></div> <div> <br /></div> <div> The value of <span style="font-family: "courier new" , "courier" , monospace;">aboutUrl</span> indicates the subject of the triple generated by this column. The curly brackets indicate that the <span style="font-family: "courier new" , "courier" , monospace;">wikidataId</span> variable should be substituted in that place to generate the URI for the subject. The value of <span style="font-family: "courier new" , "courier" , monospace;">propertyUrl</span> is <span style="font-family: "courier new" , "courier" , monospace;">rdfs:label</span>, the RDF predicate that Wikibase uses for its label field. The object of the triple by default is the value present in that column for the row. 
The <span style="font-family: "courier new" , "courier" , monospace;">lang</span> value provides the language tag for the literal.</div> </div> <div> <br /></div> <div class="separator" style="clear: both; text-align: center;"> <a href="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgXy-Y7R1Z90ZweF5uzj0Vmjq-YFffogo1wimFRk4Sz-v-Swyn7H3qCgYnv6eW7VKPFiPZkgp4ykhmLSLVUct0i-mTuWK7SZLdJG2GlMQmu_Ic9viS7dPrrQGqL47GeLNojs9cNASEF0Qc/s1600/diagram13.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"><img border="0" data-original-height="104" data-original-width="974" height="68" src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgXy-Y7R1Z90ZweF5uzj0Vmjq-YFffogo1wimFRk4Sz-v-Swyn7H3qCgYnv6eW7VKPFiPZkgp4ykhmLSLVUct0i-mTuWK7SZLdJG2GlMQmu_Ic9viS7dPrrQGqL47GeLNojs9cNASEF0Qc/s640/diagram13.png" width="640" /></a></div> <div> <br /></div> <div> <div> So when this mapping is applied to the <span style="font-family: "courier new" , "courier" , monospace;">labelEn</span> column of the first row, the triple</div> <div> <br /></div> <div> <span style="font-family: "courier new" , "courier" , monospace;"><http://www.wikidata.org/entity/Q84268104> rdfs:label "Vanderbilt Department of Biomedical Engineering"@en.</span></div> </div> <div> <br /></div> <div> would be generated.</div> <div> <br /></div> <h4> Column for a property having value that is an item (<span style="font-family: "courier new" , "courier" , monospace;">P749</span>)</h4> <div> Here is the JSON object that maps the "<span style="font-family: "courier new" , "courier" , monospace;">parentUnit</span>" column.</div> <div> <div> <br /></div> <div> <span style="font-family: "courier new" , "courier" , monospace;">{</span></div> <div> <span style="font-family: "courier new" , "courier" , monospace;">"titles": "parentUnit",</span></div> <div> <span style="font-family: "courier new" , "courier" , monospace;">"name": "parentUnit",</span></div> <div> <span style="font-family: "courier new" , 
"courier" , monospace;">"datatype": "string",</span></div> <div> <span style="font-family: "courier new" , "courier" , monospace;">"aboutUrl": "http://www.wikidata.org/entity/{wikidataId}",</span></div> <div> <span style="font-family: "courier new" , "courier" , monospace;">"propertyUrl": "http://www.wikidata.org/prop/direct/P749",</span></div> <div> <span style="font-family: "courier new" , "courier" , monospace;">"valueUrl": "http://www.wikidata.org/entity/{parentUnit}"</span></div> <div> <span style="font-family: "courier new" , "courier" , monospace;">}</span></div> <div> <br /></div> <div> As before, the subject URI is established by substituting the <span style="font-family: "courier new" , "courier" , monospace;">wikidataId</span> variable into the URI template for <span style="font-family: "courier new" , "courier" , monospace;">aboutUrl</span>. Instead of directly mapping the column value as the object of the triple, the column value is inserted into a <span style="font-family: "courier new" , "courier" , monospace;">valueUrl</span> URI template in the same manner as the <span style="font-family: "courier new" , "courier" , monospace;">aboutUrl</span>. 
</div> </div> <div> <br /></div> <div class="separator" style="clear: both; text-align: center;"> <a href="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEh-bYfhyphenhyphenloSTMPqKpNOcl5tu2y1fJbQFO0XeAD_eatE9e8-X1Yy2uXjtxRqBlp4gELq0UvHgZ5IhrusJcIMuqblkMoqr9sTSXt1-u7xSmRRT7HaHpIIV4ZHBPVIfJn9EHW9AEbiJ2MGAKM/s1600/diagram14.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"><img border="0" data-original-height="131" data-original-width="974" height="86" src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEh-bYfhyphenhyphenloSTMPqKpNOcl5tu2y1fJbQFO0XeAD_eatE9e8-X1Yy2uXjtxRqBlp4gELq0UvHgZ5IhrusJcIMuqblkMoqr9sTSXt1-u7xSmRRT7HaHpIIV4ZHBPVIfJn9EHW9AEbiJ2MGAKM/s640/diagram14.png" width="640" /></a></div> <div> <br /></div> <div> <div> Applying this column mapping to the <span style="font-family: "courier new" , "courier" , monospace;">parentUnit</span> column generates the triple:</div> <div> <br /></div> <div> <span style="font-family: "courier new" , "courier" , monospace;"><http://www.wikidata.org/entity/Q84268104> <http://www.wikidata.org/prop/direct/P749> <http://www.wikidata.org/entity/Q7914459>.</span></div> <div> <br /></div> <div> which can be abbreviated</div> <div> <br /></div> <div> <span style="font-family: "courier new" , "courier" , monospace;">wd:Q84268104 wdt:P749 wd:Q7914459.</span></div> </div> <div> <br /></div> <div> <div> The other columns in the CSV table are mapped similarly. If there is no <span style="font-family: "courier new" , "courier" , monospace;">valueUrl</span> key:value pair, the value for the column is a literal object, and if there is a value for <span style="font-family: "courier new" , "courier" , monospace;">valueUrl</span>, the value for the column is used to generate a URI denoting a non-literal object. 
</div> <div> <br /></div> <div> The value of <span style="font-family: "courier new" , "courier" , monospace;">datatype</span> is important since it determines the <span style="font-family: "courier new" , "courier" , monospace;">xsd:datatype</span> of literal values in the generated triples.</div> <div> <br /></div> <div> Not every column generates a triple with a subject that's the subject of the row. The subject may be the value of any other column. This allows the data in the row to form a more complicated graph structure.</div> </div> <div> <br /></div> <h2> How the VanderBot script writes the CSV data to the Wikidata API</h2> <div> <div> The script that does the actual writing to the Wikidata API is <a href="https://github.com/HeardLibrary/linked-data/blob/master/publications/process_csv_metadata_full.py" target="_blank">here</a>. The authentication process (<a href="https://github.com/HeardLibrary/linked-data/blob/master/publications/process_csv_metadata_full.py#L338" target="_blank">line 338</a>) is described in detail <a href="https://heardlibrary.github.io/digital-scholarship/host/wikidata/bot/#use-the-bot-to-write-to-the-wikidata-test-instance" target="_blank">elsewhere</a>. </div> <div> <br /></div> <div> The actual script begins (<a href="https://github.com/HeardLibrary/linked-data/blob/master/publications/process_csv_metadata_full.py#L374" target="_blank">line 374</a>) by loading the schema JSON into a Python data structure and loading the CSV table into a list of dictionaries. </div> <div> <br /></div> <div> The next section of the code (<a href="https://github.com/HeardLibrary/linked-data/blob/master/publications/process_csv_metadata_full.py#L402" target="_blank">lines 402 to 554</a>) uses the schema JSON to sort the columns of the tables into categories (labels, aliases, descriptions, statements with entity values, and statements with literal values). 
</div> <div> <br /></div> <div> From lines <a href="https://github.com/HeardLibrary/linked-data/blob/master/publications/process_csv_metadata_full.py#L556" target="_blank">556 to 756</a>, the script steps through each row of the table to generate the data that needs to be passed to the API to upload new data. In each row, the script goes through each category of data (labels, aliases, etc.) and turns the value in a column into the specific JSON required by the API for uploading that kind of data. I call this "snak JSON" because the units in the JSON represent "snaks" (small, discrete statements) as defined by the Wikibase data model.</div> <div> <br /></div> <div> Originally, I had written the script in a simpler way, where each piece of information about the item was written in a separate API call. This seemed intuitive since there are individual API methods for uploading every category (label, description, property, reference, etc., see the <a href="https://www.wikidata.org/w/api.php" target="_blank">API documentation</a>). However, because of rate limitations that I'll talk about later, the most reasonable way to write the data was to determine which categories needed to be written for an item and then generate the JSON for all categories at once. I then used the "all in one" method <span style="font-family: "courier new" , "courier" , monospace;">wbeditentity</span> to make all possible edits in a single API call. This resulted in much more complicated code that constructed deeply nested JSON that's difficult to read. The API help page didn't give any examples that were nearly this complicated, so getting this strategy to work required delving deeply into the Wikibase model. One lifesaver was that when a successful API call was made, the API's response included JSON structured according to the Wikibase model that was very similar to the JSON that was necessary to write to the API. 
Being able to look at this response JSON was really useful to help me figure out what subtle mistakes I was making when constructing the JSON to send to the API.</div> <div> <br /></div> <div> Simply creating labels, descriptions, and claims would not have been too hard, but I was determined to also have the capability to support references and qualifiers for claims. Here's how I hacked that task: for each statement column, I went through the columns and looked for other columns that the schema indicated were references or qualifiers of that statement. Currently, the script only handles one reference and one qualifier per statement, but when I get around to it, I'll improve the script to remove that limitation. </div> <div> <br /></div> <div> In line <a href="https://github.com/HeardLibrary/linked-data/blob/master/publications/process_csv_metadata_full.py#L759" target="_blank">759</a>, the script checks whether it found any information about the item that wasn't already written to Wikidata. If there was at least one thing to write, the script attempts to post a parameter dictionary (including the complex, constructed snak JSON) to the API (<a href="https://github.com/HeardLibrary/linked-data/blob/master/publications/process_csv_metadata_full.py#L305" target="_blank">lines 305 to 335</a>) If the attempt was unsuccessful because the API was too busy, it retries several times. If the attempt was unsuccessful for other reasons, the script displays the server's response for debugging. </div> <div> <br /></div> <div> If the attempt was successful, the script extracts identifiers of newly-created data records (item Q IDs, statement UUIDs, and reference hashes - see the previous post for more on this) and adds them to the CSV table so that the script will know in the future that those data are already in Wikidata. 
The script rewrites the CSV table after every line so that if the script crashes or the API throws an error during a write attempt, one can simply re-start the script after fixing the problem and the script will know not to create duplicate data on the second go-around (since the identifiers for the already-written data have already been added to the CSV). </div> <div> <br /></div> <div> I mentioned near the end of my previous post that I don't have any way to record whether labels, descriptions, and qualifiers had already been written or not, since URI identifiers aren't generated for them. The lack of URI identifiers means that one can't refer to those particular assertions directly by URIs in a SPARQL query. Instead, one must make a query asking explicitly for the value of the label, description, or qualifier and then determine whether it's the same as the value in the CSV table. The way the script currently works, prior to creating JSON to send to the API the script sends a SPARQL query asking for the values of labels and descriptions of all of the entities in the table (lines <a href="https://github.com/HeardLibrary/linked-data/blob/master/publications/process_csv_metadata_full.py#L465" target="_blank">465</a> and <a href="https://github.com/HeardLibrary/linked-data/blob/master/publications/process_csv_metadata_full.py#L515" target="_blank">515</a>). Then as the script processes each line of the table, it checks whether the value in the CSV is the same as what's already in Wikidata (and then does nothing) or different. If the value is different, it writes the new value from the CSV and overwrites the value in Wikidata. </div> <div> <br /></div> <div> It is important to understand this behavior, because if the CSV table is "stale" and has not been updated for a long time, other users may have improved the labels or descriptions. Running the script with the stale values will effectively revert their improvements. 
So it's important to update the CSV file with current values before running this script that writes to the API. After updating, then you can manually change any labels or descriptions that are unsatisfactory. </div> <div> <br /></div> <div> In the future, I plan to write additional scripts for managing labels and aliases, so this crude management system will hopefully be improved.</div> </div> <div> <br /></div> <h2> Cleaning up missing references</h2> <div> In some cases, other Wikidata contributors have already made statements about pre-existing Vanderbilt employee items. For example, someone may have already asserted that the Vanderbilt employee's employer was Vanderbilt University. In such cases, the primary API writing script will do nothing with those statements because it is not possible to write a reference as part of the <span style="font-family: "courier new" , "courier" , monospace;">wbeditentity</span> API method without also writing its parent statement. So I had to create <a href="https://github.com/HeardLibrary/linked-data/blob/master/publications/cleanup_csv_metadata.py" target="_blank">a separate script</a> that is a hack of the primary script in order to write the missing references. I won't describe that script here because its operation is very similar to the main script. The main difference is that it uses the <a href="https://www.wikidata.org/w/api.php?action=help&modules=wbsetreference" target="_blank"><span style="font-family: "courier new" , "courier" , monospace;">wbsetreference</span> API method</a> that is able to directly write a reference given a statement identifier. 
After running the main script, I run the cleanup script until all of the missing references have been added.</div> <div> <br /></div> <h2> Timing issues</h2> <h4> </h4> <h4> Maxlag</h4> <div> One of the things that I mentioned in my <a href="http://baskauf.blogspot.com/2019/06/putting-data-into-wikidata-using.html" target="_blank">original post on writing data to Wikidata</a> was that when writing to the "real" Wikidata API (vs. the test API or your own Wikibase instance) it's important to respect the <span style="font-family: "courier new" , "courier" , monospace;">maxlag</span> parameter.</div> <div> <div> <br /></div> <div> You can set the value of the <span style="font-family: "courier new" , "courier" , monospace;">maxlag</span> parameter in <a href="https://github.com/HeardLibrary/linked-data/blob/master/publications/process_csv_metadata_full.py#L376" target="_blank">line 381</a>. The recommended value is 5 seconds. A higher <span style="font-family: "courier new" , "courier" , monospace;">maxlag</span> value is more aggressive and a lower <span style="font-family: "courier new" , "courier" , monospace;">maxlag</span> value is "nicer" but means that you are willing to be told more often by the API to wait. The value of <span style="font-family: "courier new" , "courier" , monospace;">maxlag</span> you have chosen is added to the parameters sent to the API in <a href="https://github.com/HeardLibrary/linked-data/blob/master/publications/process_csv_metadata_full.py#L764" target="_blank">line 764</a> just before the POST operation. </div> <div> <br /></div> <div> The API lag is the average amount of time between when a user requests an operation and the API is able to honor that request. At times of low usage (e.g. nighttime in the US and Europe), the lag may be small, but at times of high usage, the lag can be over 8 seconds (I've seen it go as high as 12 seconds). 
If you set <span style="font-family: "courier new" , "courier" , monospace;">maxlag</span> to 5 seconds, you are basically telling the server that if the lag gets longer than 5 seconds, ignore your request and you'll try again later. The server tells you to wait by responding to your POST request with a response that contains a <span style="font-family: "courier new" , "courier" , monospace;">maxlag</span> error code and the amount of time the server is lagged. This error is handled in <a href="https://github.com/HeardLibrary/linked-data/blob/master/publications/process_csv_metadata_full.py#L313" target="_blank">line 315</a> of the script. When a lag error is detected, the recommended practice is to wait at least 5 seconds before retrying.</div> </div> <div> <br /></div> <h4> Bot flags</h4> <div> I naïvely believed that if I respected <span style="font-family: "courier new" , "courier" , monospace;">maxlag</span> errors that I'd be able to write to the API as fast as conditions allowed. However, the very first time I used the VanderBot script to write more than 25 records in a row, I was blocked by the API as a potential spammer with the message "As an anti-abuse measure, you are limited from performing this action too many times in a short space of time, and you have exceeded this limit. Please try again in a few minutes." Clearly my assumption was wrong. Through trial and error, I determined that a write rate of one second per write was too fast and would result in being temporarily blocked, but a rate of two seconds per write was acceptable. So to handle cases when maxlag was not invoked, I put a delay of 2 seconds on the script (<a href="https://github.com/HeardLibrary/linked-data/blob/master/publications/process_csv_metadata_full.py#L821" target="_blank">line 822</a>).</div> <div> <div> <br /></div> <div> I had several hypotheses about the cause of the blocking. One possible reason was because I didn't have a bot flag. (More on that later.) 
Another reason might be because I was running the script from my local computer rather than from <a href="https://www.mediawiki.org/wiki/PAWS" target="_blank">PAWS</a>. PAWS is a web-based interactive programming and publishing environment based on Jupyter notebooks. At Wikicon North America, I had an interesting and helpful conversation with Dominic Byrd-McDevitt of the National Archives who showed me how he used PAWS to publish NARA metadata to Wikidata via a PAWS-based system using Pywikibot. I don't think he had a bot flag and I think his publication rate was faster than one write per second. But I really didn't want to take the time to test this hypothesis by converting my script over to PAWS (which would require more experimentation with authentication). So I decided to make <a href="https://lists.wikimedia.org/pipermail/wikitech-l/2020-January/092946.html" target="_blank">a post to Wikitech-l</a> and see if I could get an answer. </div> <div> <br /></div> <div> I quickly got <a href="https://lists.wikimedia.org/pipermail/wikitech-l/2020-January/092947.html" target="_blank">a helpful answer</a> that confirmed that neither using PAWS nor Pywikibot should have any effect on the rate limit. If I had a bot flag, I might gain the "noratelimit" right, which might bypass rate limiting in many cases. </div> <div> <br /></div> <div> Bot flags are discussed <a href="https://www.wikidata.org/wiki/Wikidata:Bots" target="_blank">here</a> . In order to get a bot flag, one must detail the task that the bot will perform, then demonstrate by a test run of 50 and 250 edits that the bot is working correctly. When I was at Wikicon NA, I asked some of the Powers That Be whether it was important to get a bot flag if I was not running an autonomous bot. They said that it wasn't so important if I was monitoring the writing process. 
It would be difficult to "detail the task" that VanderBot will perform since it's just a general-purpose API writing script, and what it writes will depend on the CSV file and the JSON mapping schema. </div> <div> <br /></div> <div> In the end, I decided to just forget about getting a bot flag for now and keep the rate at 2 seconds per write. I usually don't write more than 50-100 edits in a session and often the server will be lagged anyway requiring me to wait much longer than 2 seconds. If VanderBot's task becomes more well-defined and autonomous, I might request a bot flag at some point in the future.</div> </div> <div> <br /></div> <h4> Query Service Updater lag</h4> <div> One of the principles upon which VanderBot is built is that data are written to Wikidata by POSTing to the API, but that the status of data in Wikidata is determined by SPARQL queries of the Query Service. That is a sound idea, but it has one serious limitation. Data that are added through either the API or the human GUI do not immediately appear in the graph database that supports the Query Service. There is a delay, known as the Updater lag, between the time of upload and the time of availability at the Query Service. 
We can gain a better understanding by looking at the <a href="https://grafana.wikimedia.org/d/000000489/wikidata-query-service?orgId=1" target="_blank">Query Service dashboard</a>.</div> <div> <div> <br /></div> <div> Here's a view of the lag time on the day I wrote this post (2020-02-03):</div> </div> <div> <br /></div> <div class="separator" style="clear: both; text-align: center;"> <a href="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhavyjj-Nu16dEYaAUHOhCFWPH0gMT7u19lo8dOGwRWZVjMT1N_x_bgdZ3oSRImu2BbX8ZfjvFA4Lc3VMtSxUSyX3AaYpcTq2NTz3j-V8WZLDLE9_YWG_v0k0b6SI9zFfaa2DOw-SkT6Kc/s1600/diagram15.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"><img border="0" data-original-height="525" data-original-width="974" height="344" src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhavyjj-Nu16dEYaAUHOhCFWPH0gMT7u19lo8dOGwRWZVjMT1N_x_bgdZ3oSRImu2BbX8ZfjvFA4Lc3VMtSxUSyX3AaYpcTq2NTz3j-V8WZLDLE9_YWG_v0k0b6SI9zFfaa2DOw-SkT6Kc/s640/diagram15.png" width="640" /></a></div> <div> <br /></div> <div> The first thing to notice is that there isn't just one query service. There are actually seven servers running replicates of the Query Service that handle the queries. They are all being updated constantly with data from the relational database connected to the API, but since the updating process has to compete with queries that are being run, some servers cannot keep up with the updates and lag by as much as 10 hours. Other servers have lag times of less than one minute. 
So depending on the luck of the draw of which server takes your query, data that you wrote to the API may be visible via SPARQL in a few seconds or in half a day.</div> <div> <br /></div> <div class="separator" style="clear: both; text-align: center;"> <a href="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgGwmzXpCUHuzNlCqsPV6NcxPAf6CH4R5emKJbTvEISSPHGw8WUCyyNoZMR46tLBumPxwIBoJn2G-oHwQvoBh6ry0N3mxoVMyb3WclhQStUZEutfE52M_nzM93JArNjqBhXrUExdwqT2go/s1600/diagram16.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"><img border="0" data-original-height="547" data-original-width="974" height="358" src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgGwmzXpCUHuzNlCqsPV6NcxPAf6CH4R5emKJbTvEISSPHGw8WUCyyNoZMR46tLBumPxwIBoJn2G-oHwQvoBh6ry0N3mxoVMyb3WclhQStUZEutfE52M_nzM93JArNjqBhXrUExdwqT2go/s640/diagram16.png" width="640" /></a></div> <div> <br /></div> <div> <div> A practical implication of this is that if VanderBot updates its CSV record using SQARQL, the data could be as much as half a day out of date. Normally that isn't a problem, since the data I'm working with doesn't change much, and once I write new data, I usually don't mess with it for days. However, since the script depends on a SPARQL query to determine if the labels and descriptions in the CSV differ from what's already in Wikidata, there can be problems if the script crashes half way through the rows of the CSV. If I fix the problem and immediately re-run the script, a lagged Query Service will send a response to the query saying that the labels and descriptions that I successfully wrote a few moments earlier were in their previous state. That will cause VanderBot to attempt to re-write those labels and descriptions. Fortunately, if the API detects that a write operation is trying to set the value of a label or description to the value it already has, it will do nothing. So generally, no harm is done. 
</div> <div> <br /></div> <div> This lag is why I use the response JSON sent from the API after a write to update the CSV rather than depending on a separate SPARQL query to make the update. Because the data in the response JSON comes directly from the API and not the Query Service, it is not subject to any lag.</div> </div> <div> <br /></div> <h2> Summary</h2> <div> <br /></div> <div> The API writing script part of VanderBot does the following:</div> <div> <ol> <li>Reads the JSON mapping schema to determine the meaning of the CSV table columns.</li> <li>Reads in the data from the CSV table.</li> <li>Sorts out the columns by type of data (label, alias, description, property).</li> <li>Constructs snak JSON for any new data items that need to be written.</li> <li>Checks new statements for references and qualifiers by looking at columns associated with the statement properties, then creates snak JSON for references or qualifiers as needed.</li> <li>Inserts the constructed JSON object into the required parameter dictionary for the <span style="font-family: "courier new" , "courier" , monospace;">wbeditentity</span> API method.</li> <li>POSTs to the Wikidata API via HTTP.</li> <li>Parses the response JSON from the API to discover the identifiers of newly created data items.</li> <li>Inserts the new identifiers into the table and writes the CSV file.</li> </ol> </div> <div> In the <a href="http://baskauf.blogspot.com/2020/02/vanderbot-part-4-preparing-data-to-send.html" target="_blank">final post of this series</a>, I'll describe how the data harvesting script part of VanderBot works.</div> <div> <br /></div> <div style='clear: both;'></div> </div> <div class='post-footer'> <div class='post-footer-line post-footer-line-1'> <span class='post-author vcard'> Posted by <span class='fn' itemprop='author' itemscope='itemscope' itemtype='http://schema.org/Person'> <meta content='https://www.blogger.com/profile/01896499749604153763' itemprop='url'/> <a class='g-profile' 
href='https://www.blogger.com/profile/01896499749604153763' rel='author' title='author profile'> <span itemprop='name'>Steve Baskauf</span> </a> </span> </span> <span class='post-timestamp'> at <meta content='http://baskauf.blogspot.com/2020/02/vanderbot-part-3-writing-data-from-csv.html' itemprop='url'/> <a class='timestamp-link' href='http://baskauf.blogspot.com/2020/02/vanderbot-part-3-writing-data-from-csv.html' rel='bookmark' title='permanent link'><abbr class='published' itemprop='datePublished' title='2020-02-07T14:49:00-08:00'>2:49 PM</abbr></a> </span> <span class='post-comment-link'> <a class='comment-link' href='http://baskauf.blogspot.com/2020/02/vanderbot-part-3-writing-data-from-csv.html#comment-form' onclick=''> No comments: </a> </span> <span class='post-icons'> <span class='item-control blog-admin pid-95103704'> <a href='https://www.blogger.com/post-edit.g?blogID=5299754536670281996&postID=1362477816070901447&from=pencil' title='Edit Post'> <img alt='' class='icon-action' height='18' src='https://resources.blogblog.com/img/icon18_edit_allbkg.gif' width='18'/> </a> </span> </span> <div class='post-share-buttons goog-inline-block'> <a class='goog-inline-block share-button sb-email' href='https://www.blogger.com/share-post.g?blogID=5299754536670281996&postID=1362477816070901447&target=email' target='_blank' title='Email This'><span class='share-button-link-text'>Email This</span></a><a class='goog-inline-block share-button sb-blog' href='https://www.blogger.com/share-post.g?blogID=5299754536670281996&postID=1362477816070901447&target=blog' onclick='window.open(this.href, "_blank", "height=270,width=475"); return false;' target='_blank' title='BlogThis!'><span class='share-button-link-text'>BlogThis!</span></a><a class='goog-inline-block share-button sb-twitter' href='https://www.blogger.com/share-post.g?blogID=5299754536670281996&postID=1362477816070901447&target=twitter' target='_blank' title='Share to X'><span class='share-button-link-text'>Share to 
X</span></a><a class='goog-inline-block share-button sb-facebook' href='https://www.blogger.com/share-post.g?blogID=5299754536670281996&postID=1362477816070901447&target=facebook' onclick='window.open(this.href, "_blank", "height=430,width=640"); return false;' target='_blank' title='Share to Facebook'><span class='share-button-link-text'>Share to Facebook</span></a><a class='goog-inline-block share-button sb-pinterest' href='https://www.blogger.com/share-post.g?blogID=5299754536670281996&postID=1362477816070901447&target=pinterest' target='_blank' title='Share to Pinterest'><span class='share-button-link-text'>Share to Pinterest</span></a> </div> </div> <div class='post-footer-line post-footer-line-2'> <span class='post-labels'> </span> </div> <div class='post-footer-line post-footer-line-3'> <span class='post-location'> </span> </div> </div> </div> </div> <div class='post-outer'> <div class='post hentry uncustomized-post-template' itemprop='blogPost' itemscope='itemscope' itemtype='http://schema.org/BlogPosting'> <meta content='https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi5xlZPlCGpbO_-k5B3opwaELGdK0auVHn6yGFLD6m4pfs0m3vn3U6l3JNOZQ-5otK1jxLf3SOHUf6NvZ0yITHx8yRBhURD3D0FO2HKGTommBXKgA3GfegzV-XADSPHrIwfZScQ7ET6LfQ/s640/diagram4.png' itemprop='image_url'/> <meta content='5299754536670281996' itemprop='blogId'/> <meta content='8625522348856147807' itemprop='postId'/> <a name='8625522348856147807'></a> <h3 class='post-title entry-title' itemprop='name'> <a href='http://baskauf.blogspot.com/2020/02/vanderbot-part-2-wikibase-data-model.html'>VanderBot part 2: The Wikibase data model and Wikidata identifiers</a> </h3> <div class='post-header'> <div class='post-header-line-1'></div> </div> <div class='post-body entry-content' id='post-body-8625522348856147807' itemprop='description articleBody'> <img border="0" data-original-height="534" data-original-width="975" height="350" 
src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi5xlZPlCGpbO_-k5B3opwaELGdK0auVHn6yGFLD6m4pfs0m3vn3U6l3JNOZQ-5otK1jxLf3SOHUf6NvZ0yITHx8yRBhURD3D0FO2HKGTommBXKgA3GfegzV-XADSPHrIwfZScQ7ET6LfQ/s640/diagram4.png" style="display: none;" width="640" /><br /> <h2> The Wikidata GUI and the Wikibase model</h2> To read part 1 of this series, see <a href="http://baskauf.blogspot.com/2020/02/vanderbot-python-script-for-writing-to.html" target="_blank">this page</a>.<br /> <br /> If you've edited Wikidata using the human-friendly graphical user interface (GUI), you know that items can have multiple properties, each property can have multiple values, each property/value statement can be qualified in multiple ways, each property/value statement can have multiple references, and each reference can have multiple statements about that reference. The GUI keeps this tree-like proliferation of data tidy by collapsing the references and organizing the statements by property.<br /> <br /> <div class="separator" style="clear: both; text-align: center;"> <a href="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhqIQyEdlvSvC8Kc-MXKJuMiKNedjfUg7t0SxZxmkVF_eK_p80r2FDZ5QN-ERnUGBOr1Tdi34OY5VYy2u_LijYhGuqUPg4iqny2DLBuuM9s6xS44ujhyubSFJYLSiXMq25inU0TSFlN-5I/s1600/diagram3.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"><img border="0" data-original-height="354" data-original-width="974" height="232" src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhqIQyEdlvSvC8Kc-MXKJuMiKNedjfUg7t0SxZxmkVF_eK_p80r2FDZ5QN-ERnUGBOr1Tdi34OY5VYy2u_LijYhGuqUPg4iqny2DLBuuM9s6xS44ujhyubSFJYLSiXMq25inU0TSFlN-5I/s640/diagram3.png" width="640" /></a></div> <br /> This organization of information arises from the Wikibase data model (summarized <a href="https://www.mediawiki.org/wiki/Wikibase/DataModel/Primer" target="_blank">here</a>, in detail <a href="https://www.mediawiki.org/wiki/Wikibase/DataModel" target="_blank">here</a>). 
For those unfamiliar with Wikibase, it is the underlying software system that Wikidata is built upon. Wikidata is just one instance of Wikibase and there are databases other than Wikidata that are built on the Wikibase system. All of those databases built on Wikibase will have a GUI that is similar to Wikidata, although the specific items and properties in those databases will be different from Wikidata.<br /> <br /> To be honest, I found working through the Wikibase model documentation a real slog. (I was particularly mystified by the obscure term for basic assertions: "snak". Originally, I thought it was an acronym, but later realized it was an inside joke. A snak is "small, but more than a byte".) But understanding the Wikibase model is critical for anyone who wants to either write to the Wikidata API or query the Wikidata Query Service and I wanted to do both. So I dug in.<br /> <br /> The Wikibase model is an abstract model, but it is possible to represent it as a graph model. That's important because that is why the Wikidata dataset can be exported as RDF and made queryable by SPARQL in the Wikidata Query Service. After some exploration of Wikidata using SPARQL and puzzling over the data model documentation, I was able to draw out the major parts of the Wikibase model as a graph model. It's a bit too much to put in a single diagram, so I made one that showed references and another that showed qualifiers (inserted later in the post). 
Here's the diagram for references:<br /> <br /> <div class="separator" style="clear: both; text-align: center;"> <a href="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi5xlZPlCGpbO_-k5B3opwaELGdK0auVHn6yGFLD6m4pfs0m3vn3U6l3JNOZQ-5otK1jxLf3SOHUf6NvZ0yITHx8yRBhURD3D0FO2HKGTommBXKgA3GfegzV-XADSPHrIwfZScQ7ET6LfQ/s1600/diagram4.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"><img border="0" data-original-height="584" data-original-width="779" height="478" src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEi5xlZPlCGpbO_-k5B3opwaELGdK0auVHn6yGFLD6m4pfs0m3vn3U6l3JNOZQ-5otK1jxLf3SOHUf6NvZ0yITHx8yRBhURD3D0FO2HKGTommBXKgA3GfegzV-XADSPHrIwfZScQ7ET6LfQ/s640/diagram4.png" width="640" /></a></div> <div class="separator" style="clear: both; text-align: left;"> <br /></div> <div class="separator" style="clear: both; text-align: left;"> <i>Note about namespace prefixes:</i> the exact URI for a particular namespace abbreviation will depend on the Wikibase installation. The URIs shown in the diagrams are for Wikidata. A generic Wikibase instance will contain <span style="font-family: "courier new" , "courier" , monospace;">wikibase.svc</span> as its domain name in place of <span style="font-family: "courier new" , "courier" , monospace;">www.wikidata.org</span>, and other instances will use other domain names. However, the namespace abbreviations shown above are used consistently among installations, and when querying via the human-accessible Query Service or via HTTP, the standard abbreviations can be used without declaring the underlying namespaces. That's convenient because it allows code based on the namespace abbreviations to be generic enough to be used for any Wikibase installation. </div> <div class="separator" style="clear: both; text-align: left;"> <br /></div> In the next several sections, I'm going to describe the Wikibase model and how Wikidata assigns identifiers to different parts of it. 
This will be important in deciding how to track data locally. Following that, I'll briefly describe my strategy for storing those data.<br /> <br /> <h2 style="clear: both; text-align: left;"> Item identifiers</h2> <div class="separator" style="clear: both; text-align: left;"> The subject item of a statement is identified by a unique "Q" identifier. For example, Vanderbilt University is identified by <span style="font-family: "courier new" , "courier" , monospace;">Q29052</span> and the researcher Antonis Rokas is identified by <span style="font-family: "courier new" , "courier" , monospace;">Q42352198</span>. We can make statements by connecting subject and object items with a defined Wikidata property. For example, the property <span style="font-family: "courier new" , "courier" , monospace;">P108</span> ("employer") can be used to state that Antonis Rokas' employer is Vanderbilt University: <span style="font-family: "courier new" , "courier" , monospace;">Q42352198 P108 Q29052</span>. 
When the data are transferred from the Wikidata relational database backend fed by the API to the Blazegraph graph database backend of the Query Service, the "Q" item identifiers and "P" property identifiers are turned into URIs by appending the appropriate namespace (<span style="font-family: "courier new" , "courier" , monospace;">wd:Q42352198 wdt:P108 wd:Q29052.</span>)</div> <div class="separator" style="clear: both;"> <br /></div> <div class="separator" style="clear: both;"> We can check this out by running the following query at the <a href="https://query.wikidata.org/" target="_blank">Wikidata Query Service</a>:</div> <div class="separator" style="clear: both;"> <span style="font-family: "courier new" , "courier" , monospace;"><br /></span></div> <div class="separator" style="clear: both;"> <span style="font-family: "courier new" , "courier" , monospace;">SELECT DISTINCT ?predicate ?object WHERE {</span></div> <div class="separator" style="clear: both;"> <span style="font-family: "courier new" , "courier" , monospace;"> wd:Q42352198 ?predicate ?object.</span></div> <div class="separator" style="clear: both;"> <span style="font-family: "courier new" , "courier" , monospace;"> }</span></div> <div class="separator" style="clear: both;"> <br /></div> <div class="separator" style="clear: both;"> This query returns all of the statements made about Antonis Rokas in Wikidata.</div> <div> <br /></div> <h2> Statement identifiers</h2> In order to be able to record further information about a statement itself, each statement is assigned a unique identifier in the form of a UUID. The UUID is generated at the time the statement is first made. For example, the particular statement above (<span style="font-family: "courier new" , "courier" , monospace;">Q42352198 P108 Q29052</span>) has been assigned the UUID <span style="font-family: "courier new" , "courier" , monospace;">FB9EABCA-69C0-4CFC-BDC3-44CCA9782450</span>. 
In the transfer from the relational database to Blazegraph, the namespace "<span style="font-family: "courier new" , "courier" , monospace;">wds:</span>" is prepended and for some reason, the subject Q ID is also prepended with a dash. So our example statement would be identified with the URI <span style="font-family: "courier new" , "courier" , monospace;">wds:Q42352198-FB9EABCA-69C0-4CFC-BDC3-44CCA9782450</span>. If you look at the results from the query above, you'll see<br /> <br /> <span style="font-family: "courier new" , "courier" , monospace;">p:P108 wds:Q42352198-FB9EABCA-69C0-4CFC-BDC3-44CCA9782450</span><br /> <br /> as one of the results.<br /> <br /> We can ask what statements have been made about the statement itself by using a similar query, but with the statement URI as the subject:<br /> <br /> <span style="font-family: "courier new" , "courier" , monospace;">SELECT DISTINCT ?predicate ?object WHERE {</span><br /> <span style="font-family: "courier new" , "courier" , monospace;"> wds:Q42352198-FB9EABCA-69C0-4CFC-BDC3-44CCA9782450 ?predicate ?object.</span><br /> <span style="font-family: "courier new" , "courier" , monospace;"> }</span><br /> <br /> One important detail relates to case insensitivity. UUIDs are supposed to be output as lowercase, but they are supposed to be case-insensitive on input. So in theory, a UUID should represent the same value regardless of the case. However, in the Wikidata system the generated identifier is just a string and that string would be different depending on the case. So the URI<br /> <br /> <span style="font-family: "courier new" , "courier" , monospace;">wds:Q42352198-FB9EABCA-69C0-4CFC-BDC3-44CCA9782450</span><br /> <br /> is <b>not</b> the same as the URI<br /> <br /> <span style="font-family: "courier new" , "courier" , monospace;">wds:Q42352198-fb9eabca-69c0-4cfc-bdc3-44cca9782450</span><br /> <br /> (Try running the query with the lower case version to convince yourself that this is true.) 
Typically, the UUIDs generated in Wikidata are upper case, but there are some that are lower case. For example, try<br /> <br /> <span style="font-family: "courier new" , "courier" , monospace;">wds:Q57756352-4a25cee4-45bc-63e8-74be-820454a8b7ad</span><br /> <br /> in the query. Generally it is safe to assume that the "Q" in the Q ID is upper case, but I've discovered at least one case where the Q is lower case.<br /> <div> <br /></div> <h2> Reference identifiers</h2> <div> <div> If a statement has a reference, that reference will be assigned an identifier based on a hash algorithm. Here's an example: <span style="font-family: "courier new" , "courier" , monospace;">f9c309a55265fcddd2cb0be62a530a1787c3783e</span>. The reference hash is turned into a URL by prepending the "<span style="font-family: "courier new" , "courier" , monospace;">wdref:</span>" namespace. Statements are linked to references by the property <span style="font-family: "courier new" , "courier" , monospace;">prov:wasDerivedFrom</span>. We can see an example in the results of the previous query:</div> <div> <br /></div> <div> <span style="font-family: "courier new" , "courier" , monospace;">prov:wasDerivedFrom wdref:8cfae665e8b64efffe44128acee5eaf584eda3a3</span></div> <div> <br /></div> <div> which shows the connection of the statement <span style="font-family: "courier new" , "courier" , monospace;">wds:Q42352198-FB9EABCA-69C0-4CFC-BDC3-44CCA9782450</span> (which states <span style="font-family: "courier new" , "courier" , monospace;">wd:Q42352198 wdt:P108 wd:Q29052.</span>) to the reference <span style="font-family: "courier new" , "courier" , monospace;">wdref:8cfae665e8b64efffe44128acee5eaf584eda3a3</span> (which states "reference URL http://orcid.org/0000-0002-7248-6551 and retrieved 12 January 2019"). 
We can see this if we run a version of the previous query asking about the reference statement:</div> <div> <br /></div> <div> <span style="font-family: "courier new" , "courier" , monospace;">SELECT DISTINCT ?predicate ?object WHERE {</span></div> <div> <span style="font-family: "courier new" , "courier" , monospace;"> wdref:8cfae665e8b64efffe44128acee5eaf584eda3a3 ?predicate ?object.</span></div> <div> <span style="font-family: "courier new" , "courier" , monospace;"> }</span></div> <div> <br /></div> <div> As far as I know reference hashes seem to be consistently recorded in all lower case.</div> <div> <br /></div> <div> Reference identifiers are different from statement identifiers in that they denote the reference itself, and not a particular assertion of the reference. That is, they do not denote "statement <span style="font-family: "courier new" , "courier" , monospace;">prov:wasDerivedFrom</span> reference", only the reference. (In contrast, statement identifiers denote the whole statement "subject property value".) That means that any statement whose reference has exactly the same asserted statements will have the same reference hash (and URI). 
</div> <div> <br /></div> <div> We can see that reference URIs are shared by multiple statements using this query:</div> <div> <br /></div> <div> <span style="font-family: "courier new" , "courier" , monospace;">SELECT DISTINCT ?statement WHERE {</span></div> <div> <span style="font-family: "courier new" , "courier" , monospace;"> ?statement prov:wasDerivedFrom wdref:f9c309a55265fcddd2cb0be62a530a1787c3783e.</span></div> <div> <span style="font-family: "courier new" , "courier" , monospace;"> }</span></div> </div> <div> <br /></div> <h2> Identifier examples</h2> <div> The following part of a table that I generated for Vanderbilt researchers shows examples of the identifiers I've described above.</div> <div> <br /></div> <div class="separator" style="clear: both; text-align: center;"> <a href="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjz7uN1GhhHqSWW8Dgzyltbs4_u9ILNzBY30e7Zql1ErMGztSQRK1vQiELVG49v9-CtKITjPK1PG3cMN52ZgTwtNWvBQ4j72VJrzy2OswcEKj4WifmtmIooj7cV00RkuG5HHeCYigD7Jig/s1600/diagram5.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"><img border="0" data-original-height="95" data-original-width="974" src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjz7uN1GhhHqSWW8Dgzyltbs4_u9ILNzBY30e7Zql1ErMGztSQRK1vQiELVG49v9-CtKITjPK1PG3cMN52ZgTwtNWvBQ4j72VJrzy2OswcEKj4WifmtmIooj7cV00RkuG5HHeCYigD7Jig/s1600/diagram5.png" /></a></div> <div> <br /></div> <div> We see that each item (researcher) has a unique Q ID and that each statement that the researcher is employed at Vanderbilt University (Q29052) has a unique UUID (some upper case, some lower case) and that there are more than one statement that share the same reference (having the same reference hash). </div> <div> <br /></div> <h2> Statement qualifiers</h2> <div> In addition to linking references to a statement, the statements can also be qualified. 
For example, Brandt Eichman has worked at Vanderbilt since 2004.</div> <div> <br /></div> <div class="separator" style="clear: both; text-align: center;"> <a href="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhga_BOadXLgd6d6HEGnlpfsiLhMQFkrPjisUWVEEppnhJQBsOWYwlyFJARRTIYT921FWmtABNMZa6K66KHv5g7AGx9miWOPHuzBMj7_7iQ6Fv2MFDzWqhu7p9mNGKTstMTKYX5Y9rsNG0/s1600/diagram6.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"><img border="0" data-original-height="385" data-original-width="974" height="252" src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhga_BOadXLgd6d6HEGnlpfsiLhMQFkrPjisUWVEEppnhJQBsOWYwlyFJARRTIYT921FWmtABNMZa6K66KHv5g7AGx9miWOPHuzBMj7_7iQ6Fv2MFDzWqhu7p9mNGKTstMTKYX5Y9rsNG0/s640/diagram6.png" width="640" /></a></div> <div> <br /></div> <div> Here's a diagram showing how the qualifier "start time 2004" is represented in Wikidata's graph database:</div> <div> <br /></div> <div class="separator" style="clear: both; text-align: center;"> <a href="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjF4sY2DK2WW4mGL5_E5x39YJQijzcLG4DnbiBZfH4CVrFHjBZ28SoVwuVl9yxM_P-sJv0GZYMaqKota0lzrrVGJT-dlZayJp9cJ7VI94SNHmPpNHBN67Mc0tydvv58xQo0F7dxvKP_hc8/s1600/diagram7.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"><img border="0" data-original-height="584" data-original-width="782" height="476" src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjF4sY2DK2WW4mGL5_E5x39YJQijzcLG4DnbiBZfH4CVrFHjBZ28SoVwuVl9yxM_P-sJv0GZYMaqKota0lzrrVGJT-dlZayJp9cJ7VI94SNHmPpNHBN67Mc0tydvv58xQo0F7dxvKP_hc8/s640/diagram7.png" width="640" /></a></div> <div class="separator" style="clear: both; text-align: center;"> </div> <div> <br /></div> <div> We can see that qualifiers are handled a little differently from references. 
If the qualifier property (in this case <span style="font-family: "courier new" , "courier" , monospace;">P580</span>, "since") has a simple value (literal or item), the value is linked to the statement instance using the <span style="font-family: "courier new" , "courier" , monospace;">pq:</span> namespace version of the property. </div> <div> <div> <br /></div> <div> If the value has a complex value (e.g. date), that value is assigned a hash and is linked to the statement instance using the <span style="font-family: "courier new" , "courier" , monospace;">pqv:</span> version of the property. When the data are transferred to the graph database, the <span style="font-family: "courier new" , "courier" , monospace;">wdv:</span> namespace is prepended to the hash. </div> <div> <br /></div> <div> Because dates are complex, the qualifier "since" requires a non-literal value in addition to a literal value linked by the <span style="font-family: "courier new" , "courier" , monospace;">pq:</span> version of the property (see <a href="https://www.wikidata.org/wiki/Help:Dates" target="_blank">this page</a> for more on the Wikibase date model). We can use this query:</div> <div> <br /></div> <div> <span style="font-family: "courier new" , "courier" , monospace;">SELECT DISTINCT ?property ?value WHERE {</span></div> <div> <span style="font-family: "courier new" , "courier" , monospace;"> wdv:849f00455434dc418fb4287a4f2b7638 ?property ?value.</span></div> <div> <span style="font-family: "courier new" , "courier" , monospace;"> }</span></div> <div> <br /></div> <div> to explore the non-literal date instance. In Wikidata, all dates are represented as full XML Schema dateTime values (year, month, day, hour, minute, second, timezone). 
In order to differentiate between the year "2004" and the date 1 January 2004 (both can be represented in Wikidata by the same dateTime value), the year 2004 is assigned a timePrecision of 9 and the date 1 January 2004 is assigned a timePrecision of 11.</div> <div> <br /></div> <div> Not every qualifier will have a non-literal value. For example, the property "series ordinal" (<span style="font-family: "courier new" , "courier" , monospace;">P1545</span>; used to indicate things like the order authors are listed) has only literal values (integer numbers). So there are values associated with <span style="font-family: "courier new" , "courier" , monospace;">pq:P1545</span>, but not <span style="font-family: "courier new" , "courier" , monospace;">pqv:P1545</span>. The same is true for "language of work or name" (<span style="font-family: "courier new" , "courier" , monospace;">P407</span>; used to describe websites, songs, books, etc.), which has an entity value like <span style="font-family: "courier new" , "courier" , monospace;">Q1860</span> (English).</div> </div> <div> <br /></div> <h2> Labels, aliases, and descriptions</h2> <div> <div> Labels, aliases, and descriptions are properties of items that are handled differently from other properties in Wikidata. Labels and descriptions are handled in a similar manner, so I will discuss them together.</div> <div> <br /></div> <div> Each item in Wikidata can have only one label and one description in any particular language. Therefore adding or changing a label or description requires specifying the appropriate <a href="https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes" target="_blank">ISO 639-1 code</a> for the intended language. When a label or description is changed in Wikidata, the previous version is replaced.</div> <div> <br /></div> <div> One important restriction is that the label/description combination in a particular language must be unique. 
For example, the person with the English label "John Jones" and English description "academic" can currently only be <span style="font-family: "courier new" , "courier" , monospace;">Q16089943</span>. Because labels and descriptions can change, this label/description combination won't necessarily be permanently associated with <span style="font-family: "courier new" , "courier" , monospace;">Q16089943</span> because someone might give that John Jones a more detailed description, or make his name less generic by adding a middle name or initial. So at some point in the future, it might be possible for some other John Jones to be described as "academic". An implication of the prohibition against two items sharing the same label/description pair is that it's better to create labels and descriptions that are as specific as possible to avoid collisions with pre-existing entities. As more entities get added to Wikidata, the probability of such collisions increases.</div> <div> <br /></div> <div> There is no limit to the number of aliases that an item can have per language. Aliases can be changed by either changing the value of a pre-existing alias or adding a new alias. As far as I know, there is no prohibition about aliases of one item matching aliases of another item.</div> <div> <br /></div> <div> When these statements are transferred to the Wikidata graph database, labels are values of <span style="font-family: "courier new" , "courier" , monospace;">rdfs:label</span>, descriptions are values of <span style="font-family: "courier new" , "courier" , monospace;">schema:description,</span> and aliases are values of <span style="font-family: "courier new" , "courier" , monospace;">skos:altLabel</span>. All of the values are language-tagged.</div> </div> <div> <br /></div> <h2> What am I skipping?</h2> <div> Another component of the Wikibase model that I have not discussed is ranks. 
I also haven't talked about statements that don’t have values (PropertyNoValueSnak and PropertySomeValueSnak), and sitelinks. These are features that may be important to some users, but have not yet been important enough to me to incorporate handling them in my code. </div> <div> <br /></div> <h2> Local data storage</h2> <div> If one wanted to make and track changes to Wikidata items, there are many ways to accomplish that with varying degrees of human intervention. Last year, I spent some time pondering all of the options and came up with this diagram:</div> <div> <br /></div> <div class="separator" style="clear: both; text-align: center;"> <a href="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiqp6cBS09-1snJ9mc39LTpeXwStvdvSMRVui8ogjowrAiPQCiP46kAKLk_Rz0cyC9TKjAJAY3Cv54m0wI-MKXQFDicvJqEgLVw5jkYLJXy4KPRu4jeeHzJaGGNaH0S_Urv0i8PBqyBIg0/s1600/diagram9.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"><img border="0" data-original-height="531" data-original-width="974" height="348" src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEiqp6cBS09-1snJ9mc39LTpeXwStvdvSMRVui8ogjowrAiPQCiP46kAKLk_Rz0cyC9TKjAJAY3Cv54m0wI-MKXQFDicvJqEgLVw5jkYLJXy4KPRu4jeeHzJaGGNaH0S_Urv0i8PBqyBIg0/s640/diagram9.png" width="640" /></a></div> <div> <br /></div> <div> Tracking every statement, reference, and qualifier for items would be complicated because each item could have an indefinite number and kind of properties, values, references, and qualifiers. To track all of those things would require a storage system as complicated as Wikidata itself (such as a separate a relational database or a Wikibase instance as shown in the bottom of the diagram). That's way beyond what I'm interested in doing now. But what I learned about the Wikibase model and how data items are identified suggested to me a way to track all of the data that I care about in a single, flat spreadsheet. 
That workflow can be represented by this subset of the diagram above:</div> <div> <br /></div> <div class="separator" style="clear: both; text-align: center;"> <a href="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgleQdiX1ZSdXwAwCwswc7QwfI5hO2o00U6BWbABZnTJem249MR-LVq2DaiDsVuW-MxRGhA15wnW58QeXVD3KLPn4CHlfnjhLmZwXnQRFJo0mwrcoadata1biXu7SsNg_IshQnkxCzB2s0/s1600/diagram10.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"><img border="0" data-original-height="279" data-original-width="445" height="250" src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgleQdiX1ZSdXwAwCwswc7QwfI5hO2o00U6BWbABZnTJem249MR-LVq2DaiDsVuW-MxRGhA15wnW58QeXVD3KLPn4CHlfnjhLmZwXnQRFJo0mwrcoadata1biXu7SsNg_IshQnkxCzB2s0/s400/diagram10.png" width="400" /></a></div> <div> <br /></div> <div> <div> I decided on the following structure for the spreadsheet (a CSV file, example <a href="https://github.com/HeardLibrary/linked-data/blob/master/publications/departments/engineering-to-write.csv" target="_blank">here</a>.). The Wikidata Q ID serves as the key for an item and the data in a row is about a particular item. A value in the Wikidata ID column indicates that the item already exists in Wikidata. If the Wikidata ID column does not have a value, that indicates that the item needs to be created. </div> <div> <br /></div> <div> Each statement has a column representing the property with the value of that property for an item recorded in the cell for that item's row. For each property column, there is an associated column for the UUID identifying the statement consisting of the item, property, and value. If there is no value for a property, no information is available to make that statement. If there is a value and no UUID, then the statement needs to be asserted. If there is a value and a UUID, the statement already exists in Wikidata. 
</div> <div> <br /></div> <div> References consist of one or more columns representing the properties that describe the reference. References have a single column to record the hash identifier for the reference. As with statements, if the identifier is absent, that indicates that the reference needs to be added to Wikidata. If the identifier is present, the reference has already been asserted. </div> <div> <br /></div> <div> Because labels, descriptions, and many qualifiers do not have URIs assigned as their identifiers, their values are listed in columns of the table without corresponding identifier columns. Knowing whether the existing labels, descriptions, and qualifiers already exist in Wikidata requires making a SPARQL query to find out. That process is described in the fourth blog post.</div> </div> <div> <br /></div> <h2> Where does VanderBot come in?</h2> <div> In the first post of this series, I showed a version of the following diagram to illustrate how I wanted VanderBot (my Python script for loading Vanderbilt researcher data into Wikidata) to work. 
That diagram is basically an elaboration of the simpler previous diagram.</div> <div> <br /></div> <div class="separator" style="clear: both; text-align: center;"> <a href="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEg7rr66cXGcQK6m41Mh87qYFi1mO4oQLROSMant1rICiiBG_ik_x1VWBw1B536oz8KtHVpwOxbLmwwA78wZdm96rYU_bcAPw7PqjHPQRTH3OvNnLWRymTXC_L6-mGwVeItQWcKcf5kHxDU/s1600/diagram11.png" imageanchor="1" style="margin-left: 1em; margin-right: 1em;"><img border="0" data-original-height="537" data-original-width="974" height="352" src="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEg7rr66cXGcQK6m41Mh87qYFi1mO4oQLROSMant1rICiiBG_ik_x1VWBw1B536oz8KtHVpwOxbLmwwA78wZdm96rYU_bcAPw7PqjHPQRTH3OvNnLWRymTXC_L6-mGwVeItQWcKcf5kHxDU/s640/diagram11.png" width="640" /></a></div> <div> <br /></div> <div> The part of the workflow circled in green is the <b>API writing script </b>that I will describe in the <a href="http://baskauf.blogspot.com/2020/02/vanderbot-part-3-writing-data-from-csv.html" target="_blank">third post of this series</a> (the next one). The part of the workflow circled in orange is the <b>data harvesting script</b> that I will describe in the <a href="http://baskauf.blogspot.com/2020/02/vanderbot-part-4-preparing-data-to-send.html" target="_blank">fourth post</a>. Together these two scripts form VanderBot in its current incarnation.</div> <div> <br /></div> <div> Discussing the scripts in that order may seem a bit backwards because when VanderBot operates, the data harvesting script works before the API writing script. But in developing the two scripts, I needed to think about how I was going to write to the API before I thought about how to harvest the data. So it's probably more sensible for you to learn about the API writing script first as well. 
Also, the design of the API writing script is intimately related to the Wikidata data model, so that's another reason to talk about it next after this post.</div> <div> <br /></div> <div style='clear: both;'></div> </div> <div class='post-footer'> <div class='post-footer-line post-footer-line-1'> <span class='post-author vcard'> Posted by <span class='fn' itemprop='author' itemscope='itemscope' itemtype='http://schema.org/Person'> <meta content='https://www.blogger.com/profile/01896499749604153763' itemprop='url'/> <a class='g-profile' href='https://www.blogger.com/profile/01896499749604153763' rel='author' title='author profile'> <span itemprop='name'>Steve Baskauf</span> </a> </span> </span> <span class='post-timestamp'> at <meta content='http://baskauf.blogspot.com/2020/02/vanderbot-part-2-wikibase-data-model.html' itemprop='url'/> <a class='timestamp-link' href='http://baskauf.blogspot.com/2020/02/vanderbot-part-2-wikibase-data-model.html' rel='bookmark' title='permanent link'><abbr class='published' itemprop='datePublished' title='2020-02-07T09:16:00-08:00'>9:16 AM</abbr></a> </span> <span class='post-comment-link'> <a class='comment-link' href='http://baskauf.blogspot.com/2020/02/vanderbot-part-2-wikibase-data-model.html#comment-form' onclick=''> No comments: </a> </span> <span class='post-icons'> <span class='item-control blog-admin pid-95103704'> <a href='https://www.blogger.com/post-edit.g?blogID=5299754536670281996&postID=8625522348856147807&from=pencil' title='Edit Post'> <img alt='' class='icon-action' height='18' src='https://resources.blogblog.com/img/icon18_edit_allbkg.gif' width='18'/> </a> </span> </span> <div class='post-share-buttons goog-inline-block'> <a class='goog-inline-block share-button sb-email' href='https://www.blogger.com/share-post.g?blogID=5299754536670281996&postID=8625522348856147807&target=email' target='_blank' title='Email This'><span class='share-button-link-text'>Email This</span></a><a class='goog-inline-block share-button 
sb-blog' href='https://www.blogger.com/share-post.g?blogID=5299754536670281996&postID=8625522348856147807&target=blog' onclick='window.open(this.href, "_blank", "height=270,width=475"); return false;' target='_blank' title='BlogThis!'><span class='share-button-link-text'>BlogThis!</span></a><a class='goog-inline-block share-button sb-twitter' href='https://www.blogger.com/share-post.g?blogID=5299754536670281996&postID=8625522348856147807&target=twitter' target='_blank' title='Share to X'><span class='share-button-link-text'>Share to X</span></a><a class='goog-inline-block share-button sb-facebook' href='https://www.blogger.com/share-post.g?blogID=5299754536670281996&postID=8625522348856147807&target=facebook' onclick='window.open(this.href, "_blank", "height=430,width=640"); return false;' target='_blank' title='Share to Facebook'><span class='share-button-link-text'>Share to Facebook</span></a><a class='goog-inline-block share-button sb-pinterest' href='https://www.blogger.com/share-post.g?blogID=5299754536670281996&postID=8625522348856147807&target=pinterest' target='_blank' title='Share to Pinterest'><span class='share-button-link-text'>Share to Pinterest</span></a> </div> </div> <div class='post-footer-line post-footer-line-2'> <span class='post-labels'> </span> </div> <div class='post-footer-line post-footer-line-3'> <span class='post-location'> </span> </div> </div> </div> </div> </div></div> </div> <div class='blog-pager' id='blog-pager'> <span id='blog-pager-newer-link'> <a class='blog-pager-newer-link' href='http://baskauf.blogspot.com/search?updated-max=2022-06-11T10:23:00-07:00&max-results=7&reverse-paginate=true' id='Blog1_blog-pager-newer-link' title='Newer Posts'>Newer Posts</a> </span> <span id='blog-pager-older-link'> <a class='blog-pager-older-link' href='http://baskauf.blogspot.com/search?updated-max=2020-02-07T09:16:00-08:00&max-results=7' id='Blog1_blog-pager-older-link' title='Older Posts'>Older Posts</a> </span> <a class='home-link' 
href='http://baskauf.blogspot.com/'>Home</a> </div> <div class='clear'></div> <div class='blog-feeds'> <div class='feed-links'> Subscribe to: <a class='feed-link' href='http://baskauf.blogspot.com/feeds/posts/default' target='_blank' type='application/atom+xml'>Posts (Atom)</a> </div> </div> </div></div> </div> </div> <div class='column-left-outer'> <div class='column-left-inner'> <aside> </aside> </div> </div> <div class='column-right-outer'> <div class='column-right-inner'> <aside> <div class='sidebar section' id='sidebar-right-1'><div class='widget BlogArchive' data-version='1' id='BlogArchive1'> <h2>Blog Archive</h2> <div class='widget-content'> <div id='ArchiveList'> <div id='BlogArchive1_ArchiveList'> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2025/'> 2025 </a> <span class='post-count' dir='ltr'>(1)</span> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2025/02/'> February </a> <span class='post-count' dir='ltr'>(1)</span> </li> </ul> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2023/'> 2023 </a> <span class='post-count' dir='ltr'>(2)</span> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2023/08/'> August </a> <span class='post-count' dir='ltr'>(1)</span> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' 
href='http://baskauf.blogspot.com/2023/04/'> April </a> <span class='post-count' dir='ltr'>(1)</span> </li> </ul> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2022/'> 2022 </a> <span class='post-count' dir='ltr'>(4)</span> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2022/09/'> September </a> <span class='post-count' dir='ltr'>(1)</span> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2022/06/'> June </a> <span class='post-count' dir='ltr'>(1)</span> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2022/03/'> March </a> <span class='post-count' dir='ltr'>(1)</span> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2022/01/'> January </a> <span class='post-count' dir='ltr'>(1)</span> </li> </ul> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2021/'> 2021 </a> <span class='post-count' dir='ltr'>(4)</span> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' 
href='http://baskauf.blogspot.com/2021/03/'> March </a> <span class='post-count' dir='ltr'>(4)</span> </li> </ul> </li> </ul> <ul class='hierarchy'> <li class='archivedate expanded'> <a class='toggle' href='javascript:void(0)'> <span class='zippy toggle-open'> ▼  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2020/'> 2020 </a> <span class='post-count' dir='ltr'>(5)</span> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2020/03/'> March </a> <span class='post-count' dir='ltr'>(1)</span> </li> </ul> <ul class='hierarchy'> <li class='archivedate expanded'> <a class='toggle' href='javascript:void(0)'> <span class='zippy toggle-open'> ▼  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2020/02/'> February </a> <span class='post-count' dir='ltr'>(4)</span> <ul class='posts'> <li><a href='http://baskauf.blogspot.com/2020/02/vanderbot-part-4-preparing-data-to-send.html'>VanderBot part 4: Preparing data to send to Wikidata</a></li> <li><a href='http://baskauf.blogspot.com/2020/02/vanderbot-part-3-writing-data-from-csv.html'>VanderBot part 3: Writing data from a CSV file to ...</a></li> <li><a href='http://baskauf.blogspot.com/2020/02/vanderbot-part-2-wikibase-data-model.html'>VanderBot part 2: The Wikibase data model and Wiki...</a></li> <li><a href='http://baskauf.blogspot.com/2020/02/vanderbot-python-script-for-writing-to.html'>VanderBot: A Python Script for Writing to Wikidata...</a></li> </ul> </li> </ul> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2019/'> 2019 </a> <span class='post-count' dir='ltr'>(9)</span> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' 
href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2019/10/'> October </a> <span class='post-count' dir='ltr'>(1)</span> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2019/06/'> June </a> <span class='post-count' dir='ltr'>(2)</span> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2019/05/'> May </a> <span class='post-count' dir='ltr'>(1)</span> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2019/04/'> April </a> <span class='post-count' dir='ltr'>(3)</span> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2019/03/'> March </a> <span class='post-count' dir='ltr'>(2)</span> </li> </ul> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2018/'> 2018 </a> <span class='post-count' dir='ltr'>(1)</span> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2018/02/'> February </a> <span class='post-count' dir='ltr'>(1)</span> </li> </ul> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' 
href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2017/'> 2017 </a> <span class='post-count' dir='ltr'>(6)</span> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2017/07/'> July </a> <span class='post-count' dir='ltr'>(1)</span> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2017/05/'> May </a> <span class='post-count' dir='ltr'>(1)</span> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2017/03/'> March </a> <span class='post-count' dir='ltr'>(3)</span> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2017/02/'> February </a> <span class='post-count' dir='ltr'>(1)</span> </li> </ul> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2016/'> 2016 </a> <span class='post-count' dir='ltr'>(15)</span> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2016/11/'> November </a> <span class='post-count' dir='ltr'>(3)</span> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> 
►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2016/10/'> October </a> <span class='post-count' dir='ltr'>(3)</span> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2016/04/'> April </a> <span class='post-count' dir='ltr'>(2)</span> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2016/03/'> March </a> <span class='post-count' dir='ltr'>(3)</span> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2016/02/'> February </a> <span class='post-count' dir='ltr'>(4)</span> </li> </ul> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2015/'> 2015 </a> <span class='post-count' dir='ltr'>(6)</span> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2015/09/'> September </a> <span class='post-count' dir='ltr'>(2)</span> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2015/07/'> July </a> <span class='post-count' dir='ltr'>(4)</span> </li> </ul> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> 
<a class='post-count-link' href='http://baskauf.blogspot.com/2014/'> 2014 </a> <span class='post-count' dir='ltr'>(7)</span> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2014/05/'> May </a> <span class='post-count' dir='ltr'>(3)</span> </li> </ul> <ul class='hierarchy'> <li class='archivedate collapsed'> <a class='toggle' href='javascript:void(0)'> <span class='zippy'> ►  </span> </a> <a class='post-count-link' href='http://baskauf.blogspot.com/2014/04/'> April </a> <span class='post-count' dir='ltr'>(4)</span> </li> </ul> </li> </ul> </div> </div> <div class='clear'></div> </div> </div><div class='widget Profile' data-version='1' id='Profile1'> <h2>About Me</h2> <div class='widget-content'> <a href='https://www.blogger.com/profile/01896499749604153763'><img alt='My photo' class='profile-img' height='80' src='//blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEhVm8jF34Q3Q0IttRnG66Z3cjplIPwTfYeVwoznPhxbTRnbtkuN5ekrcV1MNxedjGsgnImAIr_OAT_USbVVyX6pK_xy2GWTQDofBY9K7fiMj9DBOK4_dORjPC0UTaHIvP0/s220/profile-pic-carmen-small.jpg' width='80'/></a> <dl class='profile-datablock'> <dt class='profile-data'> <a class='profile-name-link g-profile' href='https://www.blogger.com/profile/01896499749604153763' rel='author' style='background-image: url(//www.blogger.com/img/logo-16.png);'> Steve Baskauf </a> </dt> </dl> <a class='profile-link' href='https://www.blogger.com/profile/01896499749604153763' rel='author'>View my complete profile</a> <div class='clear'></div> </div> </div></div> </aside> </div> </div> </div> <div style='clear: both'></div> <!-- columns --> </div> <!-- main --> </div> </div> <div class='main-cap-bottom cap-bottom'> <div class='cap-left'></div> <div class='cap-right'></div> </div> </div> <footer> <div class='footer-outer'> <div class='footer-cap-top cap-top'> <div class='cap-left'></div> <div 
class='cap-right'></div> </div> <div class='fauxborder-left footer-fauxborder-left'> <div class='fauxborder-right footer-fauxborder-right'></div> <div class='region-inner footer-inner'> <div class='foot no-items section' id='footer-1'></div> <table border='0' cellpadding='0' cellspacing='0' class='section-columns columns-2'> <tbody> <tr> <td class='first columns-cell'> <div class='foot no-items section' id='footer-2-1'></div> </td> <td class='columns-cell'> <div class='foot no-items section' id='footer-2-2'></div> </td> </tr> </tbody> </table> <!-- outside of the include in order to lock Attribution widget --> <div class='foot section' id='footer-3' name='Footer'><div class='widget Attribution' data-version='1' id='Attribution1'> <div class='widget-content' style='text-align: center;'> Simple theme. Powered by <a href='https://www.blogger.com' target='_blank'>Blogger</a>. </div> <div class='clear'></div> </div></div> </div> </div> <div class='footer-cap-bottom cap-bottom'> <div class='cap-left'></div> <div class='cap-right'></div> </div> </div> </footer> <!-- content --> </div> </div> <div class='content-cap-bottom cap-bottom'> <div class='cap-left'></div> <div class='cap-right'></div> </div> </div> </div> <script type='text/javascript'> window.setTimeout(function() { document.body.className = document.body.className.replace('loading', ''); }, 10); </script> <script type="text/javascript" src="https://www.blogger.com/static/v1/widgets/688949419-widgets.js"></script> <script type='text/javascript'> window['__wavt'] = 'AOuZoY4ISPrGe7fWdZS2DIX3Qgxv4oX15g:1739839758708';_WidgetManager._Init('//www.blogger.com/rearrange?blogID\x3d5299754536670281996','//baskauf.blogspot.com/2020/02/','5299754536670281996'); _WidgetManager._SetDataContext([{'name': 'blog', 'data': {'blogId': '5299754536670281996', 'title': 'Steve Baskauf\x27s blog', 'url': 'http://baskauf.blogspot.com/2020/02/', 'canonicalUrl': 'http://baskauf.blogspot.com/2020/02/', 'homepageUrl': 
'http://baskauf.blogspot.com/', 'searchUrl': 'http://baskauf.blogspot.com/search', 'canonicalHomepageUrl': 'http://baskauf.blogspot.com/', 'blogspotFaviconUrl': 'http://baskauf.blogspot.com/favicon.ico', 'bloggerUrl': 'https://www.blogger.com', 'hasCustomDomain': false, 'httpsEnabled': true, 'enabledCommentProfileImages': true, 'gPlusViewType': 'FILTERED_POSTMOD', 'adultContent': false, 'analyticsAccountNumber': '', 'encoding': 'UTF-8', 'locale': 'en', 'localeUnderscoreDelimited': 'en', 'languageDirection': 'ltr', 'isPrivate': false, 'isMobile': false, 'isMobileRequest': false, 'mobileClass': '', 'isPrivateBlog': false, 'isDynamicViewsAvailable': true, 'feedLinks': '\x3clink rel\x3d\x22alternate\x22 type\x3d\x22application/atom+xml\x22 title\x3d\x22Steve Baskauf\x26#39;s blog - Atom\x22 href\x3d\x22http://baskauf.blogspot.com/feeds/posts/default\x22 /\x3e\n\x3clink rel\x3d\x22alternate\x22 type\x3d\x22application/rss+xml\x22 title\x3d\x22Steve Baskauf\x26#39;s blog - RSS\x22 href\x3d\x22http://baskauf.blogspot.com/feeds/posts/default?alt\x3drss\x22 /\x3e\n\x3clink rel\x3d\x22service.post\x22 type\x3d\x22application/atom+xml\x22 title\x3d\x22Steve Baskauf\x26#39;s blog - Atom\x22 href\x3d\x22https://www.blogger.com/feeds/5299754536670281996/posts/default\x22 /\x3e\n', 'meTag': '', 'adsenseHostId': 'ca-host-pub-1556223355139109', 'adsenseHasAds': false, 'adsenseAutoAds': false, 'boqCommentIframeForm': true, 'loginRedirectParam': '', 'view': '', 'dynamicViewsCommentsSrc': '//www.blogblog.com/dynamicviews/4224c15c4e7c9321/js/comments.js', 'dynamicViewsScriptSrc': '//www.blogblog.com/dynamicviews/f6e0cc369f0f1a05', 'plusOneApiSrc': 'https://apis.google.com/js/platform.js', 'disableGComments': true, 'interstitialAccepted': false, 'sharing': {'platforms': [{'name': 'Get link', 'key': 'link', 'shareMessage': 'Get link', 'target': ''}, {'name': 'Facebook', 'key': 'facebook', 'shareMessage': 'Share to Facebook', 'target': 'facebook'}, {'name': 'BlogThis!', 'key': 'blogThis', 
'shareMessage': 'BlogThis!', 'target': 'blog'}, {'name': 'X', 'key': 'twitter', 'shareMessage': 'Share to X', 'target': 'twitter'}, {'name': 'Pinterest', 'key': 'pinterest', 'shareMessage': 'Share to Pinterest', 'target': 'pinterest'}, {'name': 'Email', 'key': 'email', 'shareMessage': 'Email', 'target': 'email'}], 'disableGooglePlus': true, 'googlePlusShareButtonWidth': 0, 'googlePlusBootstrap': '\x3cscript type\x3d\x22text/javascript\x22\x3ewindow.___gcfg \x3d {\x27lang\x27: \x27en\x27};\x3c/script\x3e'}, 'hasCustomJumpLinkMessage': false, 'jumpLinkMessage': 'Read more', 'pageType': 'archive', 'pageName': 'February 2020', 'pageTitle': 'Steve Baskauf\x27s blog: February 2020'}}, {'name': 'features', 'data': {}}, {'name': 'messages', 'data': {'edit': 'Edit', 'linkCopiedToClipboard': 'Link copied to clipboard!', 'ok': 'Ok', 'postLink': 'Post Link'}}, {'name': 'template', 'data': {'name': 'Simple', 'localizedName': 'Simple', 'isResponsive': false, 'isAlternateRendering': false, 'isCustom': false, 'variant': 'simplysimple', 'variantId': 'simplysimple'}}, {'name': 'view', 'data': {'classic': {'name': 'classic', 'url': '?view\x3dclassic'}, 'flipcard': {'name': 'flipcard', 'url': '?view\x3dflipcard'}, 'magazine': {'name': 'magazine', 'url': '?view\x3dmagazine'}, 'mosaic': {'name': 'mosaic', 'url': '?view\x3dmosaic'}, 'sidebar': {'name': 'sidebar', 'url': '?view\x3dsidebar'}, 'snapshot': {'name': 'snapshot', 'url': '?view\x3dsnapshot'}, 'timeslide': {'name': 'timeslide', 'url': '?view\x3dtimeslide'}, 'isMobile': false, 'title': 'Steve Baskauf\x27s blog', 'description': '', 'url': 'http://baskauf.blogspot.com/2020/02/', 'type': 'feed', 'isSingleItem': false, 'isMultipleItems': true, 'isError': false, 'isPage': false, 'isPost': false, 'isHomepage': false, 'isArchive': true, 'isLabelSearch': false, 'archive': {'year': 2020, 'month': 2, 'rangeMessage': 'Showing posts from February, 2020'}}}]); _WidgetManager._RegisterWidget('_NavbarView', new _WidgetInfo('Navbar1', 'navbar', 
document.getElementById('Navbar1'), {}, 'displayModeFull')); _WidgetManager._RegisterWidget('_HeaderView', new _WidgetInfo('Header1', 'header', document.getElementById('Header1'), {}, 'displayModeFull')); _WidgetManager._RegisterWidget('_BlogView', new _WidgetInfo('Blog1', 'main', document.getElementById('Blog1'), {'cmtInteractionsEnabled': false, 'lightboxEnabled': true, 'lightboxModuleUrl': 'https://www.blogger.com/static/v1/jsbin/1360229384-lbx.js', 'lightboxCssUrl': 'https://www.blogger.com/static/v1/v-css/1964470060-lightbox_bundle.css'}, 'displayModeFull')); _WidgetManager._RegisterWidget('_BlogArchiveView', new _WidgetInfo('BlogArchive1', 'sidebar-right-1', document.getElementById('BlogArchive1'), {'languageDirection': 'ltr', 'loadingMessage': 'Loading\x26hellip;'}, 'displayModeFull')); _WidgetManager._RegisterWidget('_ProfileView', new _WidgetInfo('Profile1', 'sidebar-right-1', document.getElementById('Profile1'), {}, 'displayModeFull')); _WidgetManager._RegisterWidget('_AttributionView', new _WidgetInfo('Attribution1', 'footer-3', document.getElementById('Attribution1'), {}, 'displayModeFull')); </script> </body> </html>