<!-- CINXE.COM -->

<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="UTF-8">
<meta http-equiv="Content-Security-Policy" content="upgrade-insecure-requests">
<meta name="viewport" content="width=device-width">
<title>February 2014 - Duke Collaboratory for Classics Computing (DC3)</title>
<link rel="profile" href="//gmpg.org/xfn/11">
<link rel="pingback" href="https://blogs.library.duke.edu/dcthree/xmlrpc.php">
<meta name='robots' content='noindex, follow' />
<style>img:is([sizes="auto" i], [sizes^="auto," i]) { contain-intrinsic-size: 3000px 1500px }</style>
<!-- Open Graph / Twitter card metadata for this monthly archive page. -->
<meta property="og:locale" content="en_US" />
<meta property="og:type" content="website" />
<meta property="og:title" content="February 2014 - Duke Collaboratory for Classics Computing (DC3)" />
<meta property="og:url" content="https://blogs.library.duke.edu/dcthree/2014/02/" />
<meta property="og:site_name" content="Duke Collaboratory for Classics Computing (DC3)" />
<meta name="twitter:card" content="summary_large_image" />
<!-- schema.org graph (CollectionPage, BreadcrumbList, WebSite). Keep this on one line:
     JSON string values must not contain raw line breaks, or the whole graph fails to parse. -->
<script type="application/ld+json" class="yoast-schema-graph">{"@context":"https://schema.org","@graph":[{"@type":"CollectionPage","@id":"https://blogs.library.duke.edu/dcthree/2014/02/","url":"https://blogs.library.duke.edu/dcthree/2014/02/","name":"February 2014 - Duke Collaboratory for Classics Computing (DC3)","isPartOf":{"@id":"https://blogs.library.duke.edu/dcthree/#website"},"breadcrumb":{"@id":"https://blogs.library.duke.edu/dcthree/2014/02/#breadcrumb"},"inLanguage":"en-US"},{"@type":"BreadcrumbList","@id":"https://blogs.library.duke.edu/dcthree/2014/02/#breadcrumb","itemListElement":[{"@type":"ListItem","position":1,"name":"Home","item":"https://blogs.library.duke.edu/dcthree/"},{"@type":"ListItem","position":2,"name":"Archives for February 2014"}]},{"@type":"WebSite","@id":"https://blogs.library.duke.edu/dcthree/#website","url":"https://blogs.library.duke.edu/dcthree/","name":"Duke Collaboratory for Classics Computing (DC3)","description":"a collection of parts flying in loose formation","potentialAction":[{"@type":"SearchAction","target":{"@type":"EntryPoint","urlTemplate":"https://blogs.library.duke.edu/dcthree/?s={search_term_string}"},"query-input":{"@type":"PropertyValueSpecification","valueRequired":true,"valueName":"search_term_string"}}],"inLanguage":"en-US"}]}</script>
<link rel="alternate" type="application/rss+xml" title="Duke Collaboratory for Classics Computing (DC3) » Feed" href="https://blogs.library.duke.edu/dcthree/feed/" />
<link rel="alternate" type="application/rss+xml" title="Duke Collaboratory for Classics Computing (DC3) » Comments Feed" href="https://blogs.library.duke.edu/dcthree/comments/feed/" />
<script>
/*
 * Emoji support loader (minified, auto-generated; do not hand-edit or re-wrap).
 *
 * - window._wpemojiSettings carries the image base URLs and the "concatemoji" script URL.
 * - p() draws two strings on a canvas and reports whether their pixel data is identical;
 *   u() uses it to run the "flag" and "emoji" tests; f() builds the canvas and runs every test.
 * - Results are cached in sessionStorage under "wpEmojiSettingsSupports". When Worker,
 *   OffscreenCanvas, URL.createObjectURL and Blob exist, the tests run in a worker whose source
 *   is assembled from f.toString(), u.toString() and p.toString().
 * - After DOMContentLoaded, if supports.everything is false, the concatemoji script (or
 *   twemoji + wpemoji) is appended to <head> as a deferred script.
 *
 * The minified statement below must stay on ONE line: a line break directly after `return`
 * triggers automatic semicolon insertion, making the "flag" test return undefined.
 */
window._wpemojiSettings = {"baseUrl":"https:\/\/s.w.org\/images\/core\/emoji\/15.0.3\/72x72\/","ext":".png","svgUrl":"https:\/\/s.w.org\/images\/core\/emoji\/15.0.3\/svg\/","svgExt":".svg","source":{"concatemoji":"https:\/\/blogs.library.duke.edu\/dcthree\/wp-includes\/js\/wp-emoji-release.min.js?ver=6.7.2"}};
/*! This file is auto-generated */
!function(i,n){var o,s,e;function c(e){try{var t={supportTests:e,timestamp:(new Date).valueOf()};sessionStorage.setItem(o,JSON.stringify(t))}catch(e){}}function p(e,t,n){e.clearRect(0,0,e.canvas.width,e.canvas.height),e.fillText(t,0,0);var t=new Uint32Array(e.getImageData(0,0,e.canvas.width,e.canvas.height).data),r=(e.clearRect(0,0,e.canvas.width,e.canvas.height),e.fillText(n,0,0),new Uint32Array(e.getImageData(0,0,e.canvas.width,e.canvas.height).data));return t.every(function(e,t){return e===r[t]})}function u(e,t,n){switch(t){case"flag":return n(e,"\ud83c\udff3\ufe0f\u200d\u26a7\ufe0f","\ud83c\udff3\ufe0f\u200b\u26a7\ufe0f")?!1:!n(e,"\ud83c\uddfa\ud83c\uddf3","\ud83c\uddfa\u200b\ud83c\uddf3")&&!n(e,"\ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc65\udb40\udc6e\udb40\udc67\udb40\udc7f","\ud83c\udff4\u200b\udb40\udc67\u200b\udb40\udc62\u200b\udb40\udc65\u200b\udb40\udc6e\u200b\udb40\udc67\u200b\udb40\udc7f");case"emoji":return!n(e,"\ud83d\udc26\u200d\u2b1b","\ud83d\udc26\u200b\u2b1b")}return!1}function f(e,t,n){var r="undefined"!=typeof WorkerGlobalScope&&self instanceof WorkerGlobalScope?new OffscreenCanvas(300,150):i.createElement("canvas"),a=r.getContext("2d",{willReadFrequently:!0}),o=(a.textBaseline="top",a.font="600 32px Arial",{});return e.forEach(function(e){o[e]=t(a,e,n)}),o}function t(e){var t=i.createElement("script");t.src=e,t.defer=!0,i.head.appendChild(t)}"undefined"!=typeof Promise&&(o="wpEmojiSettingsSupports",s=["flag","emoji"],n.supports={everything:!0,everythingExceptFlag:!0},e=new Promise(function(e){i.addEventListener("DOMContentLoaded",e,{once:!0})}),new Promise(function(t){var n=function(){try{var e=JSON.parse(sessionStorage.getItem(o));if("object"==typeof e&&"number"==typeof e.timestamp&&(new Date).valueOf()<e.timestamp+604800&&"object"==typeof e.supportTests)return e.supportTests}catch(e){}return null}();if(!n){if("undefined"!=typeof Worker&&"undefined"!=typeof OffscreenCanvas&&"undefined"!=typeof URL&&URL.createObjectURL&&"undefined"!=typeof Blob)try{var e="postMessage("+f.toString()+"("+[JSON.stringify(s),u.toString(),p.toString()].join(",")+"));",r=new Blob([e],{type:"text/javascript"}),a=new Worker(URL.createObjectURL(r),{name:"wpTestEmojiSupports"});return void(a.onmessage=function(e){c(n=e.data),a.terminate(),t(n)})}catch(e){}c(n=f(s,u,p))}t(n)}).then(function(e){for(var t in e)n.supports[t]=e[t],n.supports.everything=n.supports.everything&&n.supports[t],"flag"!==t&&(n.supports.everythingExceptFlag=n.supports.everythingExceptFlag&&n.supports[t]);n.supports.everythingExceptFlag=n.supports.everythingExceptFlag&&!n.supports.flag,n.DOMReady=!1,n.readyCallback=function(){n.DOMReady=!0}}).then(function(){return e}).then(function(){var e;n.supports.everything||(n.readyCallback(),(e=n.source||{}).concatemoji?t(e.concatemoji):e.wpemoji&&e.twemoji&&(t(e.twemoji),t(e.wpemoji)))}))}((window,document),window._wpemojiSettings);
</script>
<style id='wp-emoji-styles-inline-css'>
img.wp-smiley, img.emoji { display: inline !important; border: none !important; box-shadow: none !important; height: 1em !important; width: 1em !important; margin: 0 0.07em !important; vertical-align: -0.1em !important; background: none !important; padding: 0 !important; }
</style>
<link rel='stylesheet' id='twentyfourteen-lato-css' href='https://blogs.library.duke.edu/dcthree/wp-content/themes/twentyfourteen/fonts/font-lato.css?ver=20230328' media='all' />
<link rel='stylesheet' id='genericons-css' href='https://blogs.library.duke.edu/dcthree/wp-content/themes/twentyfourteen/genericons/genericons.css?ver=3.0.3' media='all' />
<link rel='stylesheet' id='twentyfourteen-style-css' href='https://blogs.library.duke.edu/dcthree/wp-content/themes/dul-2014/style.css?ver=20241112' media='all' />
<link rel='stylesheet' id='twentyfourteen-block-style-css' href='https://blogs.library.duke.edu/dcthree/wp-content/themes/twentyfourteen/css/blocks.css?ver=20240708' media='all' />
<style id='akismet-widget-style-inline-css'>
.a-stats { --akismet-color-mid-green: #357b49; --akismet-color-white: #fff; --akismet-color-light-grey: #f6f7f7; max-width: 350px; width: auto; }
.a-stats * { all: unset; box-sizing: border-box; }
.a-stats strong { font-weight: 600; }
.a-stats a.a-stats__link, .a-stats a.a-stats__link:visited, .a-stats a.a-stats__link:active { background: var(--akismet-color-mid-green);
border: none; box-shadow: none; border-radius: 8px; color: var(--akismet-color-white); cursor: pointer; display: block; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen-Sans', 'Ubuntu', 'Cantarell', 'Helvetica Neue', sans-serif; font-weight: 500; padding: 12px; text-align: center; text-decoration: none; transition: all 0.2s ease; }
/* Extra specificity to deal with TwentyTwentyOne focus style */
.widget .a-stats a.a-stats__link:focus { background: var(--akismet-color-mid-green); color: var(--akismet-color-white); text-decoration: none; }
.a-stats a.a-stats__link:hover { filter: brightness(110%); box-shadow: 0 4px 12px rgba(0, 0, 0, 0.06), 0 0 2px rgba(0, 0, 0, 0.16); }
.a-stats .count { color: var(--akismet-color-white); display: block; font-size: 1.5em; line-height: 1.4; padding: 0 13px; white-space: nowrap; }
</style>
<script src="https://blogs.library.duke.edu/dcthree/wp-includes/js/jquery/jquery.min.js?ver=3.7.1" id="jquery-core-js"></script>
<script src="https://blogs.library.duke.edu/dcthree/wp-includes/js/jquery/jquery-migrate.min.js?ver=3.4.1" id="jquery-migrate-js"></script>
<script src="https://blogs.library.duke.edu/dcthree/wp-content/themes/twentyfourteen/js/functions.js?ver=20230526" id="twentyfourteen-script-js" defer data-wp-strategy="defer"></script>
<link rel="https://api.w.org/" href="https://blogs.library.duke.edu/dcthree/wp-json/" />
<link rel="EditURI" type="application/rsd+xml" title="RSD" href="https://blogs.library.duke.edu/dcthree/xmlrpc.php?rsd" />
<meta name="generator" content="WordPress 6.7.2" />
<link href='//fonts.googleapis.com/css?family=Libre+Baskerville:400,700,400italic' rel='stylesheet' type='text/css'>
<!-- Matomo analytics: queues a page view and link tracking on window._paq, points the tracker at
     analytics.lib.duke.edu (site id 28), then injects matomo.js asynchronously before the first
     <script> element in the document. -->
<script>
var _paq = window._paq = window._paq || [];
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
  var u="//analytics.lib.duke.edu/";
  _paq.push(['setTrackerUrl', u+'matomo.php']);
  _paq.push(['setSiteId', '28']);
  var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
  g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
</head>
<body class="archive date wp-embed-responsive group-blog header-image list-view full-width">
<div id="page" class="hfeed site">
<!-- Banner image linking home. It carries a single alt attribute: a duplicate attribute is a
     parse error and only the first occurrence is kept by the parser. -->
<div id="site-header">
<a href="https://blogs.library.duke.edu/dcthree/" rel="home">
<img alt="Duke Collaboratory for Classics Computing (DC3)" src="https://blogs.library.duke.edu/dcthree/files/2014/11/dc3.jpg" width="1260" height="200">
</a>
</div>
<div id="library_logo"><a href="//library.duke.edu" title="Duke University Libraries"><img src="/wp-content/themes/dul-2014/images/library_logo_transparent.png" alt="Duke University Libraries"></a></div>
<!-- Primary navigation: menu toggle button, skip link to #content, then the main menu. -->
<nav id="primary-navigation" class="site-navigation primary-navigation" role="navigation">
<button class="menu-toggle">Primary Menu</button>
<a class="screen-reader-text skip-link" href="#content">Skip to content</a>
<div class="menu-main-container"><ul id="menu-main" class="nav-menu"><li id="menu-item-235" class="menu-item menu-item-type-post_type menu-item-object-page menu-item-235"><a href="https://blogs.library.duke.edu/dcthree/partners/">Partners</a></li>
<li id="menu-item-236" class="menu-item menu-item-type-post_type menu-item-object-page menu-item-236"><a href="https://blogs.library.duke.edu/dcthree/people/">People</a></li>
<li id="menu-item-237" class="menu-item menu-item-type-post_type menu-item-object-page menu-item-237"><a href="https://blogs.library.duke.edu/dcthree/projects/">Projects</a></li>
</ul></div>
</nav>
<div id="main" class="site-main">
<section id="primary" class="content-area">
<div id="content" class="site-content" role="main">
<header class="page-header">
<h1 class="page-title"> Monthly Archives: February 2014 </h1>
</header>
<article id="post-172" class="post-172 post type-post status-publish format-standard hentry category-uncategorized">
<header class="entry-header">
<div class="entry-meta">
<span class="cat-links"><a 
href="https://blogs.library.duke.edu/dcthree/category/uncategorized/" rel="category tag">Uncategorized</a></span> </div> <h1 class="entry-title"><a href="https://blogs.library.duke.edu/dcthree/2014/02/11/how-fine-are-a-balrogs-teeth/" rel="bookmark">Searching the DDbDP (Or, How Fine are a Balrog’s Teeth?)</a></h1> <div class="entry-meta"> <span class="entry-date"><a href="https://blogs.library.duke.edu/dcthree/2014/02/11/how-fine-are-a-balrogs-teeth/" rel="bookmark"><time class="entry-date" datetime="2014-02-11T15:57:20-05:00">February 11, 2014</time></a></span> <span class="byline"><span class="author vcard"><a class="url fn n" href="https://blogs.library.duke.edu/dcthree/author/jds15duke-edu/" rel="author">Joshua Sosin</a></span></span> <span class="comments-link"><a href="https://blogs.library.duke.edu/dcthree/2014/02/11/how-fine-are-a-balrogs-teeth/#respond">Leave a comment</a></span> </div> </header> <div class="entry-content"> <span class="Z3988" title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Adc&rfr_id=info%3Asid%2Focoins.info%3Agenerator&rft.title=Searching+the+DDbDP+%28Or%2C+How+Fine+are+a+Balrog%26%238217%3Bs+Teeth%3F%29&rft.aulast=Sosin&rft.aufirst=Joshua&rft.subject=Uncategorized&rft.source=Duke+Collaboratory+for+Classics+Computing+%28DC3%29&rft.date=2014-02-11&rft.type=blogPost&rft.format=text&rft.identifier=https://blogs.library.duke.edu/dcthree/2014/02/11/how-fine-are-a-balrogs-teeth/&rft.language=English"></span> <p>A <a href="http://papyri.info/docs/ddbdp">DDbDP</a> user recently wrote with the following good question.</p> <blockquote><p>“Several years ago, when I searched for the phrase “ενταγιον εμου”, the PN returned P.Oxy. X 1326, PSI I 36 & SB XXII 15268 among the hits. When I perform the same search now, these papyri no longer appear. They all have in common a misspelling of ενταγιον: εντ<α>γιον (P.Oxy. 
X 1326), εντακιων (PSI I 36) & ενταγιων (SB XXII 15268).”</p></blockquote> <p>We not only <strong>like</strong> but <strong>need</strong> questions like this. This is how we improve the <a href="http://papyri.info/">PN</a>. So, keep them coming. The answer to this one is both simple and complicated. For the simple version, skip to the end. For the complicated one, read on. [Warning: if you are expert in text-searching matters, the following will seem dull and simplistic. But if you are a papyrologist it might help to explain why the PN works the way it does.]</p> <p style="text-align: center">***</p> <p>First, the user was not hallucinating; these texts exist:</p> <ul> <li><a href="http://papyri.info/ddbdp/p.oxy;10;1326">http://papyri.info/ddbdp/p.oxy;10;1326</a> <ul> <li>Line 1 has: “ἐντγιον(*) ἐμο̣ῦ” in the text and “l. ἐντ<ά>γιον” in the app, which is rather crummy; “ἐντ<ά>γιον” alone in the text, without app entry, might have been sufficient, even better.</li> </ul> </li> <li><a href="http://papyri.info/ddbdp/psi;1;36">http://papyri.info/ddbdp/psi;1;36</a> <ul> <li>Line 1 has: “ἐντάκιων(*) ἐμοῦ” in the text and “l. ἐντάγιον” in the app, which is pretty clear</li> </ul> </li> <li><a href="http://papyri.info/ddbdp/sb;22;15268">http://papyri.info/ddbdp/sb;22;15268</a> <ul> <li>Line 1 has: “ἐντάγιων(*) ἐμοῦ” in the text and “l. ἐντάγιον” in the app.</li> </ul> </li> </ul> <p>Now, several years ago, when our user first ran this search, the underlying encoding, at <em>PSI</em> I 36.1, looked like this:</p> <blockquote> <pre><choice> <reg>ἐντάγιον</reg> <orig>εντακιων</orig> </choice> ἐμοῦ</pre> </blockquote> <p>The DDbDP used to display the ‘<a href="http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-reg.html">reg</a>(ularized)’ form up in the text, and the ‘<a href="http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-orig.html">orig</a>(inal)’ text down in the app: “ἐντάγιον” up above and “εντακιων pap.” down below. 
Papyrologists, who revel in Greek as written, did not like this. So, a couple years ago, we changed the styling in order to render the original reading up in the text and the regularized reading down in the app.</p> <p>This took some work. The DDbDP’s practice had been to include diacriticals only on the regularized reading and not on what the scribe wrote, regardless of what the editor printed. This meant two things (1) We could not (and still cannot) know from the DDbDP data alone what the original editors printed in text and/or apparatus. This is a shame (although it is correctable). (2) Some 90,000 origs lacked diacriticals! So, we added them, programmatically (<a href="http://www.kcl.ac.uk/artshums/depts/ddh/people/research/lawrence/index.aspx">Faith Lawrence</a> and <a href="http://www.kcl.ac.uk/artshums/depts/ddh/people/research/bodard/index.aspx">Gabby Bodard</a>, both of <a href="http://www.kcl.ac.uk/index.aspx">KCL</a> <a href="http://www.kcl.ac.uk/artshums/depts/ddh/index.aspx">DDH</a>, did a fantastic job with this).</p> <p><span style="line-height: 1.6em">In this particular case, both we and the original editors treat ἐντάκιων as a phonetic representation of ἐντάγιον. So, we <a href="http://papyri.info/ddbdp/psi;1;36/source">produced</a>:</span></p> <blockquote> <pre><choice> <reg>ἐντάγιον</reg> <orig>ἐντάκιων</orig> </choice> ἐμοῦ</pre> </blockquote> <p>Today, the DDbDP has “ἐντάκιων” and the app indicates “l. ἐντάγιον”.</p> <p>Understanding this backstory is essential to understanding why our user’s experience a few years ago was different.</p> <p style="text-align: center">***</p> <p>Remember, before we changed editorial practice as regards reg/orig, the encoding was</p> <blockquote> <pre><choice> <reg>ἐντάγιον</reg> <orig>εντακιων</orig> </choice> ἐμοῦ</pre> </blockquote> <p>The phrase that appeared in the text was ἐντάγιον ἐμοῦ. 
The search index:</p> <ul> <li>knew that this text contained the word ἐντάγιον and the PN could find it</li> <li>knew that this text contained the word εντακιων and the PN could find it</li> <li>knew that this text contained the phrase ἐντάγιον ἐμοῦ and the PN could find it (as our user correctly recalls)</li> <li>DID NOT know that this text contained the phrase εντακιων ἐμοῦ</li> </ul> <p>Remember, the current encoding is</p> <blockquote> <pre><choice> <reg>ἐντάγιον</reg> <orig>ἐντάκιων</orig> </choice> ἐμοῦ</pre> </blockquote> <p>The phrase that appears in the text is ἐντάκιων ἐμου. The search index:</p> <ul> <li>knows that this text contains the word ἐντάγιον and the <a href="http://papyri.info/search?DATE_MODE=LOOSE&VOLUME=1&DOCS_PER_PAGE=15&STRING1=%CE%B5%CE%BD%CF%84%CE%B1%CE%B3%CE%B9%CE%BF%CE%BD&target1=TEXT&no_caps1=on&no_marks1=on&SERIES=psi">PN can find it</a></li> <li>knows that this text contains the word ἐντάκιων and the <a href="http://papyri.info/search?STRING=(%CE%B5%CE%BD%CF%84%CE%B1%CE%BA%CE%B9%CF%89%CE%BD)&no_caps=on&no_marks=on&target=text&DATE_MODE=LOOSE&VOLUME=1&DOCS_PER_PAGE=15&SERIES=psi">PN can find it</a></li> <li>knows that this text contains the phrase ἐντάκιων ἐμοῦ and the <a href="http://papyri.info/search?STRING=(%22%CE%B5%CE%BD%CF%84%CE%B1%CE%BA%CE%B9%CF%89%CE%BD+%CE%B5%CE%BC%CE%BF%CF%85%22)&no_caps=on&no_marks=on&target=text&DATE_MODE=LOOSE&DOCS_PER_PAGE=15">PN can find it</a></li> <li>DOES NOT know that this text contains the phrase ἐντάγιον ἐμοῦ (the <a href="http://papyri.info/search?STRING=(%22%CE%B5%CE%BD%CF%84%CE%B1%CE%B3%CE%B9%CE%BF%CE%BD+%CE%B5%CE%BC%CE%BF%CF%85%22)&no_caps=on&no_marks=on&target=text&DATE_MODE=LOOSE&VOLUME=1&DOCS_PER_PAGE=15&SERIES=psi">PN cannot find it</a>, as our user was surprised to discover)</li> </ul> <p><span style="line-height: 1.6em">Thus, the PN search is now better in one way and worse in another! 
It knows about all of the same discrete words, but where phrases are concerned it now does a better job with what the scribe wrote, and a poorer job with the modern normalized representation. This may be a good trade, from a papyrological point of view, but it is still a trade.</span></p> <p style="text-align: center">***</p> <p>Ok, but why isn’t the PN search as smart as we are? Two answers: (1) because you are just smarter. (2) Actually, maybe you’re not.</p> <p>In this particular case, the index ‘knows’ that ἐμοῦ immediately follows ἐντάκιων. We humans know that ἐμοῦ immediately follows ἐντάκιων on the papyrus, but that it also follows ἐντάγιον in another, constructed sense. Can we ask the indexer to ‘know’ as much as we do? Yes, sort of. Can we ask it to treat all reg/orig pairs as simultaneously occupying the same position in the line? Yes, but only to a point.</p> <p>Suppose…</p> <ul> <li>a scribe wrote: <strong>Abe’s dog has fire teeth</strong><strong>.</strong></li> <li>but meant to write: <strong>Abe’s dog has fine teeth</strong>.</li> <li>the editor prints: <strong>Abe’s dog has fire (l. fine) teeth.</strong></li> <li>we encode: <strong>Abe’s dog has </strong>&lt;choice&gt;&lt;reg&gt;<strong>fine</strong>&lt;/reg&gt;&lt;orig&gt;<strong>fire</strong>&lt;/orig&gt;&lt;/choice&gt;<strong> teeth.</strong></li> </ul> <p>If we want to be able to support proximity searches against all possible words in this sentence, we must in effect index both possible sentences, 10 words instead of 5.</p> <ol> <li><strong>Abe’s dog has fire teeth</strong><strong>.</strong></li> <li><strong>Abe’s dog has fine teeth</strong><strong>.</strong></li> </ol> <p>The more reg/orig pairs a text has, the greater the number of possible sentences, and the larger the number of index versions that we must maintain. The increase is exponential. 
If “Abe’s” was itself regularized from “Ave’s” we would have to index this single sentence four times.</p> <ol> <li><strong>Abe’s dog has fire teeth</strong><strong>.</strong></li> <li><strong>Abe’s dog has fine teeth</strong><strong>.</strong></li> <li><strong></strong><strong>Ave’s dog has fire teeth</strong><strong>.</strong></li> <li><strong></strong><strong>Ave’s dog has fine teeth</strong><strong>.</strong></li> </ol> <p>Remember also that reg/orig expressions can address strings that can be complicated. Say a scribe writes κεγο for κεγω, which is regularized to καὶ ἐγώ. For such a regularization three possible strings occupy the same position in a line, but one of them is two words and two of them are one!</p> <p>Now imagine that</p> <ul> <li>a scribe wrote: <strong>Abe’s dog has fire tooth</strong><strong>.</strong></li> <li>but meant to write either: <strong>Abe’s dog has a fine tooth</strong>.</li> <li>or: <strong>Abe’s dog has fine teeth</strong>.</li> <li>the editor prints something like: <strong>Abe’s dog has fire tooth (l. <strong>&lt;a&gt; fine tooth, or </strong><strong>fine </strong>teeth).</strong></li> <li>we encode: <strong>Abe’s dog </strong>&lt;choice&gt;&lt;reg&gt;&lt;app type="alternative"&gt;&lt;lem&gt;<strong>has </strong>&lt;supplied reason="omitted"&gt;<strong>a</strong>&lt;/supplied&gt;<strong> fine tooth</strong>&lt;/lem&gt;&lt;rdg&gt;<strong>has fine teeth</strong>&lt;/rdg&gt;&lt;/app&gt;&lt;/reg&gt;&lt;orig&gt;<strong>has fire tooth</strong>&lt;/orig&gt;&lt;/choice&gt;<strong>.</strong></li> <li>The Leiden+ expression of this bit of EpiDoc is much easier to take in: <strong>Abe’s dog </strong>&lt;:&lt;:<strong>has </strong>&lt;<strong>a</strong>&gt;<strong> fine tooth</strong>|alt|<strong>has fine teeth</strong>:&gt;|reg|<strong>has fire tooth</strong>:&gt;.</li> </ul> <p>Now imagine that a subsequent editor, M. Smith, revisits the manuscript and reads: <strong>A Balrog has <strong>firey (l. </strong>fiery) tooth (l. teeth). </strong>The encoding for this correction will be:</p> <ul> <li>&lt;app type="editorial"&gt;&lt;lem resp="M. 
Smith"&gt;<strong>A Balrog has </strong>&lt;choice&gt;&lt;reg&gt;<strong>fiery</strong>&lt;/reg&gt;&lt;orig&gt;<strong>firey</strong>&lt;/orig&gt;&lt;/choice&gt; &lt;choice&gt;&lt;reg&gt;<strong>teeth</strong>&lt;/reg&gt;&lt;orig&gt;<strong>tooth</strong>&lt;/orig&gt;&lt;/choice&gt;&lt;/lem&gt;&lt;rdg resp="Original editor"&gt;<strong>Abe’s dog </strong>&lt;choice&gt;&lt;reg&gt;&lt;app type="alternative"&gt;&lt;lem&gt;<strong>has </strong>&lt;supplied reason="omitted"&gt;<strong>a</strong>&lt;/supplied&gt;<strong> fine tooth</strong>&lt;/lem&gt;&lt;rdg&gt;<strong>has fine teeth</strong>&lt;/rdg&gt;&lt;/app&gt;&lt;/reg&gt;&lt;orig&gt;<strong>has fire tooth</strong>&lt;/orig&gt;&lt;/choice&gt;&lt;/rdg&gt;&lt;/app&gt;</li> <li><strong></strong>And in Leiden+: &lt;:<strong>A Balrog has </strong>&lt;:<strong>fiery</strong>|reg|<strong>firey</strong>:&gt; &lt;:<strong>teeth</strong>|reg|<strong>tooth</strong>:&gt;=M. Smith|ed|<strong>Abe’s dog </strong>&lt;:&lt;:<strong>has </strong>&lt;<strong>a</strong>&gt;<strong> fine tooth</strong>|alt|<strong>has fine teeth</strong>:&gt;|reg|<strong>has fire tooth</strong>:&gt;=Original editor:&gt;</li> </ul> <p>If firey/fiery is a common regularization and tooth/teeth is as well, and if we want our users to be able to search for all combinations of this phrase, then we must index the correction alone four times (2 regs x 2 origs = 4 possible combinations):</p> <ol> <li><strong>A Balrog has <strong>firey </strong>tooth.</strong> [indexing orig | orig ]</li> <li><strong>A Balrog has <strong>firey</strong> teeth.</strong> [indexing orig | reg ]</li> <li><strong>A Balrog has fiery tooth. </strong>[indexing reg | orig ]</li> <li><strong>A Balrog has fiery teeth.</strong> [indexing reg | reg ]</li> </ol> <p>If we generate only two versions of the text in the index–one that includes origs but not regs and another that includes regs but not origs–then when someone searches for “fiery tooth” (a plausible phrase among students of <a href="https://www.google.com/search?q=balrog&espv=210&es_sm=91&source=lnms&tbm=isch&sa=X&ei=EJX6UoumFIyL1AHo_4GIBA&ved=0CAkQ_AUoAQ&biw=1065&bih=1008">Balrogs</a>) s/he will not find this text. 
And perhaps Smith was right to read “Balrog” but wrong about their “fiery” teeth. Maybe this Balrog has “fine” teeth. If another user wants to know how fine a Balrog’s teeth are, and so wants to search for “fine” in proximity to “Balrog”, the index must include one version for every possible combination of strings not only in Smith’s corrected text but also in that of the original edition.</p> <p>Do the math. <span style="line-height: 1.6em">How many versions of the index do we need to create in order to accommodate all possible combinations of words presented by the two competing constructions of this five-word (or is it six-word?) sentence? How many words separate “A” from “fine”? Still wonder why the search engine isn’t as smart as you? There are only contingent answers. This is not easy. The PN search works as well as it does thanks to the industry and genius of Tim Hill and Hugh Cayless. But what is intrinsically complicated will likely stay that way.</span></p> <p>In order to be able to deliver searches that specify distance between any two possible words that appear in any editorial construction of a given DDbDP text, we would need to have as many parallel indexes of that text as there are possible combinations of reg/orig pairs (and also alternate readings, and also BL corrections and their deprecated readings, and also with abbreviations expanded and unexpanded, and so on). Even if we were to create parallel indexes only to accommodate reg/orig pairs, the burden might still be more than we could serve in an ordinary production environment: to put it simplistically, a text with a dozen simple reg/orig pairs would require 4,096 (2<sup>12</sup>) parallel indexes.</p> <p style="text-align: center">***</p> <p>But in the meantime, what’s a papyrologist to do?</p> <p>First, search the DDbDP for ενταγιον εμου (without quotes); this will find all texts that contain both words, in any position. In other words, this finds A+B not “A B”. 
It <b>will</b> catch all three of the examples that our user asked about. Then, walk down the list and strike those ‘hits’ that you do not want; it takes a few extra seconds, but probably no more than it would take to craft one single, perfect query (if such were even possible).</p> <p>Also, if there are common variants, run multiple queries at once. Enter</p> <ul> <li>“εντακιων εμου” [in the first search box]</li> <li>OR “ενταγιον εμου” [in the second search box]</li> </ul> <p>…and so on. Or, using wildcards, search for εντα?ι?ν εμου, which will find ενταγιον, εντακιων, ενταχιον, vel sim.</p> <p>Bottom line: something as simple as searching for a couple contiguous words is not simple. And in the inherently complex, unstable, and variant-rich world of papyrological documents, very little is simple. No search engine can erase that inherent complexity. No matter how much the PN improves, it will almost always be best to attack questions with multiple searches and a variety of strategies.</p> <p style="text-align: center">***</p> <p>And what comes next? 
In the short term, we aim to generate a few concurrent indexes to the DDbDP, perhaps:</p> <ol> <li>text including (1) original readings, but not their regularized forms (reg/<strong>orig</strong>), (2) original erroneous readings, but not their corrected forms (<a href="http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-corr.html">corr</a>/<a href="http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-sic.html"><strong>sic</strong></a>), (3) corrections to texts (from BL or PE), (4) expanded abbreviations</li> <li>text including (1) original readings, but not their regularized forms (reg/<strong>orig</strong>), (2) original erroneous readings, but not their corrected forms (corr/<strong>sic</strong>), (3) corrections to texts (from BL or PE), (4) <strong>un</strong>expanded abbreviations</li> <li>text including (1) regularized readings (<strong>reg</strong>/orig), (2) corrected readings (<strong>corr</strong>/sic), (3) deprecated readings, (4) expanded abbreviations</li> </ol> <p>Roughly speaking, you can think of the first two as indexes of what we put in the text, and the third as a slightly less complete index of what we put in the app.</p> <p>We have also started thinking about an altogether different approach (which serves different goals as well). If we were to generate a comprehensive, curated index of unambiguous phonetic variants attested in the papyri (e.g. ἐντάγιον=ἐντάκιον), then we could automatically run concurrent searches (e.g. “ἐντάγιον ἐμοῦ” OR “ἐντάκιον ἐμοῦ”), whenever users query phrases containing one of the indexed pairs. So, a user searches for “ἐντάγιον ἐμοῦ” and we return texts with “ἐντάκιον ἐμοῦ” as well. 
One day.</p> <p>Neither is a comprehensive ‘fix.’ But either one would have let our user find ἐντάγιον ἐμοῦ at <em>PSI</em> I 36.</p> </div> </article> </div> </section> <div id="secondary"> <h2 class="site-description">a collection of parts flying in loose formation</h2> <div id="primary-sidebar" class="primary-sidebar widget-area" role="complementary"> <aside id="text-2" class="widget widget_text"><h1 class="widget-title">About</h1> <div class="textwidget">The Duke Collaboratory for Classics Computing, the DC3, is a no-nonsense, interdisciplinary, results-oriented research group devoted to the creation and care of standards, services, and tooling for digital classics and beyond. We aim to be flexible, durable, and to leverage the strengths of our many partnerships so as to be a collection of parts flying in loose formation. Like the plane. <p> <p>The DC3 manages papyri.info data and tooling, experiments in the development of new complementary resources, and engages in teaching and outreach at Duke and beyond.</div> </aside><aside id="search-3" class="widget widget_search"><h1 class="widget-title">Search the DC3 Blog</h1><form role="search" method="get" class="search-form" action="https://blogs.library.duke.edu/dcthree/"> <label> <span class="screen-reader-text">Search for:</span> <input type="search" class="search-field" placeholder="Search …" value="" name="s" /> </label> <input type="submit" class="search-submit" value="Search" /> </form></aside> <aside id="recent-posts-2" class="widget widget_recent_entries"> <h1 class="widget-title">Recent Posts</h1><nav aria-label="Recent Posts"> <ul> <li> <a href="https://blogs.library.duke.edu/dcthree/2018/01/10/digital-servius/">Digital Servius</a> <span class="post-date">January 10, 2018</span> </li> <li> <a href="https://blogs.library.duke.edu/dcthree/2016/08/04/theres-new-whale-town/">There’s a new whale in town</a> <span class="post-date">August 4, 2016</span> </li> <li> <a 
href="https://blogs.library.duke.edu/dcthree/2016/02/04/harpokration-done-but-not-even-almost/">Harpokration, done but not even almost.</a> <span class="post-date">February 4, 2016</span> </li> <li> <a href="https://blogs.library.duke.edu/dcthree/2015/05/26/harpokration-on-line/">Harpokration On Line</a> <span class="post-date">May 26, 2015</span> </li> <li> <a href="https://blogs.library.duke.edu/dcthree/2015/04/02/text-of-shifting-frontiers-in-the-digital-humanities/">Text of “Shifting Frontiers in the Digital Humanities”</a> <span class="post-date">April 2, 2015</span> </li> </ul> </nav></aside><aside id="archives-3" class="widget widget_archive"><h1 class="widget-title">Archives</h1><nav aria-label="Archives"> <ul> <li><a href='https://blogs.library.duke.edu/dcthree/2018/01/'>January 2018</a></li> <li><a href='https://blogs.library.duke.edu/dcthree/2016/08/'>August 2016</a></li> <li><a href='https://blogs.library.duke.edu/dcthree/2016/02/'>February 2016</a></li> <li><a href='https://blogs.library.duke.edu/dcthree/2015/05/'>May 2015</a></li> <li><a href='https://blogs.library.duke.edu/dcthree/2015/04/'>April 2015</a></li> <li><a href='https://blogs.library.duke.edu/dcthree/2015/03/'>March 2015</a></li> <li><a href='https://blogs.library.duke.edu/dcthree/2014/09/'>September 2014</a></li> <li><a href='https://blogs.library.duke.edu/dcthree/2014/08/'>August 2014</a></li> <li><a href='https://blogs.library.duke.edu/dcthree/2014/02/' aria-current="page">February 2014</a></li> <li><a href='https://blogs.library.duke.edu/dcthree/2013/12/'>December 2013</a></li> <li><a href='https://blogs.library.duke.edu/dcthree/2013/11/'>November 2013</a></li> <li><a href='https://blogs.library.duke.edu/dcthree/2013/08/'>August 2013</a></li> <li><a href='https://blogs.library.duke.edu/dcthree/2013/07/'>July 2013</a></li> <li><a href='https://blogs.library.duke.edu/dcthree/2013/06/'>June 2013</a></li> </ul> </nav></aside> </div> </div> </div> <footer id="colophon" class="site-footer" 
role="contentinfo"> <div class="site-info"> <a href="https://blogs.library.duke.edu/dcthree/" rel="home">Duke Collaboratory for Classics Computing (DC3)</a> </div> </footer> </div> </body> </html>