CINXE.COM

Bug #389217 “solr treats diacriticals as word breaks” : Bugs : Open Library

<!DOCTYPE html> <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr"> <head> <base href="https://bugs.launchpad.net/openlibrary/+bug/389217/+index" /> <meta charset="UTF-8" /> <title>Bug #389217 “solr treats diacriticals as word breaks” : Bugs : Open Library</title> <link rel="apple-touch-icon" sizes="180x180" href="/@@/apple-touch-icon.png?v=2022" /> <link rel="icon" type="image/png" sizes="32x32" href="/@@/favicon-32x32.png?v=2022" /> <link rel="icon" type="image/png" sizes="16x16" href="/@@/favicon-16x16.png?v=2022" /> <link rel="manifest" href="/@@/site.webmanifest?v=2022" /> <link rel="mask-icon" href="/@@/safari-pinned-tab.svg?v=2022" color="#e9531f" /> <link rel="shortcut icon" href="/@@/favicon.ico?v=2022" /> <meta name="msapplication-TileColor" content="#da532c" /> <meta name="msapplication-config" content="/@@/browserconfig.xml?v=2022" /> <meta name="theme-color" content="#ffffff" /> <link rel="canonical" href="https://bugs.launchpad.net/bugs/389217" /> <link rel="alternate" type="application/atom+xml" href="http://feeds.launchpad.net/bugs/389217/bug.atom" title="Bug 389217 Feed" /> <link type="text/css" rel="stylesheet" media="screen, print" href="/+icing/rev22ade00ab50b929fac63b8ee7252243aceda294a/combo.css" /> <meta name="description" content="Found by Karen: Perhaps I'd like to find books about astronauts who practice eastern religion: http://openlibrary.org/search?q=nasa+jainism This instead finds a bunch of Indian names that contain the letters &quot;nasa&quot; starting in the middle of a word, but preceded by an accented letter. Need to check that we're using the right solr input tokenizer. Unicode normalization may also figure into this." 
/>
<meta property="og:description" content="Found by Karen: Perhaps I'd like to find books about astronauts who practice eastern religion: http://openlibrary.org/search?q=nasa+jainism This instead finds a bunch of Indian names that contain the letters &quot;nasa&quot; starting in the middle of a word, but preceded by an accented letter. Need to check that we're using the right solr input tokenizer. Unicode normalization may also figure into this." />
<meta property="og:title" content="Bug #389217 “solr treats diacriticals as word breaks” : Bugs : Open Library" />
<meta property="og:type" content="website" />
<meta property="og:image" content="/@@/launchpad-og-image.png" />
<meta property="og:url" content="https://bugs.launchpad.net/bugs/389217" />
<meta property="og:site_name" content="Launchpad" />

<!-- Global LP namespace; LP.cache is read by the page scripts below. -->
<script type="text/javascript">
  var LP = {
    cache: {},
    links: {}
  };
</script>
<script type="text/javascript">var cookie_scope = '; Path=/; Secure; Domain=.launchpad.net';</script>
<script type="text/javascript" src="/+combo/rev22ade00ab50b929fac63b8ee7252243aceda294a/?yui/yui/yui-min.js&amp;lp/meta.js&amp;yui/loader/loader-min.js"></script>

<!--
  YUI loader configuration.
  NOTE: every "//" comment in these scripts must sit on its own physical
  line. When a comment shares a line with the code that follows it, that
  code is commented out and the script no longer parses.
-->
<script type="text/javascript">
  // Use the unminified ("raw") module filter only when LP.devmode is set.
  var raw = null;
  if (LP.devmode) {
    raw = 'raw';
  }
  YUI.GlobalConfig = {
    combine: true,
    comboBase: '/+combo/rev22ade00ab50b929fac63b8ee7252243aceda294a/?',
    root: 'yui/',
    filter: raw,
    debug: false,
    fetchCSS: false,
    maxURLLength: 2000,
    groups: {
      lp: {
        combine: true,
        base: '/+combo/rev22ade00ab50b929fac63b8ee7252243aceda294a/?lp/',
        comboBase: '/+combo/rev22ade00ab50b929fac63b8ee7252243aceda294a/?',
        root: 'lp/',
        // comes from including lp/meta.js
        modules: LP_MODULES,
        fetchCSS: false
      }
    }
  }</script>
<script type="text/javascript">
  // we need this to create a single YUI instance all events and code
  // talks across. All instances of YUI().use should be based off of
  // LPJS instead.
  var LPJS = new YUI();
</script>

<!-- Page-wide widget setup, run once the DOM is ready. -->
<script id="base-layout-load-scripts" type="text/javascript">
  //<![CDATA[
  LPJS.use(
    'base', 'node', 'console', 'event', 'oop', 'lp',
    'lp.app.foldables',
    'lp.app.sorttable',
    'lp.app.inlinehelp',
    'lp.app.links',
    'lp.bugs.bugtask_index',
    'lp.bugs.subscribers',
    'lp.app.ellipsis',
    'lp.code.branchmergeproposal.diff',
    'lp.views.global',
    function(Y) {
      Y.on("domready", function () {
        var global_view = new Y.lp.views.Global();
        global_view.render();

        Y.lp.app.sorttable.SortTable.init();
        Y.lp.app.inlinehelp.init_help();
        Y.lp.activate_collapsibles();
        Y.lp.app.foldables.activate();
        Y.lp.app.links.check_valid_lp_links();
      });

      // Navigate to the new URL when the context's web_link changes.
      Y.on('lp:context:web_link:changed', function(e) {
        window.location = e.new_value;
      });
    }
  );
  //]]>
</script>

<script id="base-helper-functions" type="text/javascript">
  //<![CDATA[
  // This code is pulled from lp.js that needs to be available on every
  // request. Pulling here to get it outside the scope of the YUI block.

  /**
   * Focus the first element matching the given name which can be focused.
   *
   * Only the first node exposing a `focus` member is considered: the loop
   * stops after it even when that node has zero offsetHeight and was
   * therefore not focused. Errors thrown while focusing are logged through
   * the YUI console instead of propagating.
   *
   * @param {string} name Value of the `name` attribute to look up.
   */
  function setFocusByName(name) {
    var nodes = document.getElementsByName(name);
    var i, node;
    for (i = 0; i < nodes.length; i++) {
      node = nodes[i];
      if (node.focus) {
        try {
          // Trying to focus a hidden element throws an error in IE8.
          if (node.offsetHeight !== 0) {
            node.focus();
          }
        } catch (e) {
          LPJS.use('console', function(Y) {
            Y.log('In setFocusByName(<' +
                  node.tagName + ' type=' + node.type + '>): ' + e);
          });
        }
        break;
      }
    }
  }

  /**
   * Mark the widget with the given element id as checked.
   *
   * @param {string} widget_name Id of the element whose `checked`
   *     property is set to true.
   * @param {Event} [event] Triggering event, if any; key codes 9 and 13
   *     are ignored.
   */
  function selectWidget(widget_name, event) {
    if (event && (event.keyCode === 9 || event.keyCode === 13)) {
      // Avoid firing if user is tabbing through or simply pressing
      // enter to submit the form.
      return;
    }
    document.getElementById(widget_name).checked = true;
  }
  //]]>
</script>

<script type="text/javascript" id="available-official-tags-js">var available_official_tags = ["api", "ariel", "covers", "i18n", "language", "lending", "marc", "rdf", "search", "types", "works"];</script>

<!-- Bug-page specific setup: task table, comment list, subscribers portlet. -->
<script type="text/javascript">
  LPJS.use(
    'base', 'node', 'oop', 'event',
    'lp.bugs.bugtask_index',
    'lp.bugs.subscribers',
    'lp.code.branchmergeproposal.diff',
    'lp.app.comment',
    'lp.services.messages.edit',
    function(Y) {
      Y.on('domready', function() {
        Y.lp.code.branchmergeproposal.diff.connect_diff_links();
        Y.lp.bugs.bugtask_index.setup_bugtask_index();
        Y.lp.bugs.bugtask_index.setup_bugtask_table();

        // Comments on this page are attached to the bug itself.
        LP.cache.comment_context = LP.cache.bug;
        var cl = new Y.lp.app.comment.CommentList();
        cl.render();

        var sl = new Y.lp.bugs.subscribers.createBugSubscribersLoader({
          container_box: '#other-bug-subscribers',
          subscribers_details_view: '/+bug-portlet-subscribers-details',
          subscribe_someone_else_link: '.menu-link-addsubscriber'
        }, window);

        Y.lp.services.messages.edit.setup();
      });
    }
  );
</script>

<style type="text/css">
  /* Align the 'add comment' link to the right of the comment box. */
  #add-comment-form textarea { width: 100%; }
  #add-comment-form { max-width: 60em; padding-bottom: 4em; }
  #add-comment-form .actions {float: right;}
  .buglink-summary dd { font-size: 10px; }
  a#privacy-link:link:hover, a#privacy-link:visited:hover {text-decoration:none;}
</style>
<style type="text/css">
  .yui3-overlay .value label {
    /* It normally makes sense for form labels to be bold, but since
       this form consists only of radio buttons, there's nothing but
       labels so we just get wall-to-wall bold.
*/ font-weight: normal !important; } </style> </head> <body id="document" itemscope="" itemtype="http://schema.org/WebPage" class="tab-bugs main_side public yui3-skin-sam"> <div class="yui-d0"> <div id="locationbar" class="login-logout"> <div id="logincontrol"><a href="https://bugs.launchpad.net/openlibrary/+bug/389217/+login">Log in / Register</a></div> </div><!--id="locationbar"--> <div id="watermark" class="watermark-apps-portlet"> <div> <a href="https://launchpad.net/openlibrary"><img alt="" width="64" height="64" src="https://launchpadlibrarian.net/35799906/ol-launchpad.png" /></a> </div> <div class="wide"> <h2 id="watermark-heading"><a href="https://launchpad.net/openlibrary">Open Library</a></h2> </div> <!-- Application Menu --> <ul class="facetmenu"> <li class="overview"><a href="https://launchpad.net/openlibrary">Overview</a></li> <li class="branches"><a href="https://code.launchpad.net/openlibrary">Code</a></li> <li class="bugs active"><a href="https://bugs.launchpad.net/openlibrary">Bugs</a></li> <li class="specifications"><a href="https://blueprints.launchpad.net/openlibrary">Blueprints</a></li> <li class="translations"><a href="https://translations.launchpad.net/openlibrary">Translations</a></li> <li class="answers"><a href="https://answers.launchpad.net/openlibrary">Answers</a></li> </ul> </div> <div class="yui-t4"> <div id="maincontent" class="yui-main"> <div class="yui-b" dir="ltr"> <div class="context-publication"> <h1 id="edit-title"> <span class="yui3-editable_text-text ellipsis" style="max-width: 95%;"> solr treats diacriticals as word breaks </span> </h1> <div id="registration" class="registering"> Bug #389217 reported by <a href="https://launchpad.net/~solrize" class="sprite person">solrize</a> <time title="2009-06-18 21:25:06 UTC" datetime="2009-06-18T21:25:06.636746+00:00">on 2009-06-18</time> </div> </div> <div id="request-notifications"> </div> <div> <div id="bug-is-duplicate"> </div> <div style="float: right;"> <span><a 
href="/+help-bugs/bug-heat.html" target="help" class="sprite flame">6</a></span> </div> <div class="actions"> <span id="affectsmetoo" style="display: inline">This bug affects 1 person</span> </div> <table id="affected-software" class="listing"> <thead> <tr> <th colspan="2">Affects</th> <th>Status</th> <th>Importance</th> <th>Assigned to</th> <th>Milestone</th> </tr> </thead> <tbody> <tr class="highlight" id="tasksummary456203"> <td> </td> <td> <span id="bugtarget-picker-tasksummary456203"> <span class="yui3-activator-data-box"> <a class="sprite product" href="https://bugs.launchpad.net/openlibrary">Open Library</a> </span> <div class="yui3-activator-message-box yui3-activator-hidden"></div> </span> </td> <td style="width: 20%; vertical-align: middle"> <div class="status-content" style="width: 100%; float: left"> <span style="float: left" class="value statusCONFIRMED">Confirmed</span> </div> </td> <td style="width: 15em; vertical-align: middle"> <div class="importance-content" style="width: 100%; float: left"> <span style="float: left" class="value importanceLOW">Low</span> </div> </td> <td style="width:20%; margin: 0; padding: 0; vertical-align: middle; padding-left: 0.5em"> <span id="assignee-picker-tasksummary456203"> <span class="yui3-activator-data-box"> <a class="sprite person" href="https://launchpad.net/~edwardbetts">Edward Betts</a> </span> <div class="yui3-activator-message-box yui3-activator-hidden"></div> </span> </td> <td style="width: 20%; vertical-align: middle"> <div class="milestone-content" style="width: 100%; float: left"> <a class="value" href=""></a> </div> </td> </tr> </tbody> </table> <div id="maincontentsub"> <div class="top-portlet"> <div itemprop="mainContentOfPage" class="report"> <div> <div class="lazr-multiline-edit" id="edit-description"> <div class="clearfix"> <h3>Bug Description</h3> </div> <div class="yui3-editable_text-text"><p>Found by Karen:</p> <p>Perhaps I&#x27;d like to find books about astronauts who practice eastern 
religion:</p> <p><a rel="nofollow" href="http://openlibrary.org/search?q=nasa+jainism">http://<wbr />openlibrary.<wbr />org/search?<wbr />q=nasa+<wbr />jainism</a></p> <p>This instead finds a bunch of Indian names that contain the letters &quot;nasa&quot; starting in the middle of a word, but preceded by an accented letter. Need to check that we&#x27;re using the right solr input tokenizer. Unicode normalization may also figure into this.</p></div> </div> </div> <div style="margin:-10px 0 20px 5px" class="clearfix"> </div> <div id="bug-tags"> <span id="tags-heading"> </span> <span id="tag-list"> </span> </div> <script type="text/javascript"> LPJS.use('event', 'node', 'lp.bugs.tags_entry', function(Y) { Y.on('domready', function(e) { Y.lp.bugs.tags_entry.setup_tag_entry( available_official_tags); }, window); }); </script> <div class="clearfix"></div> </div> <div id="branches-and-cves"> <div id="bug-branches-container" style="float: left"> </div><!-- bug-branch-container --> <div class="clearfix"></div> </div> <!-- branches and CVEs --> </div> <div> <div class="boardComment"> <div class="boardCommentDetails"> <a href="https://launchpad.net/~solrize" class="sprite person">solrize (solrize)</a> <time title="2009-06-18 21:25:37 UTC" datetime="2009-06-18T21:25:37.983735+00:00">on 2009-06-18</time> </div> <div class="boardCommentActivity"> <table class="bug-activity"> <tr> <td colspan="2">Changed in openlibrary: </td> </tr> <tr> <td style="text-align: right;"> <b>assignee</b>: </td> <td> nobody &#8594; solrize (solrize) </td> </tr> <tr> <td style="text-align: right;"> <b>importance</b>: </td> <td> Undecided &#8594; Low </td> </tr> <tr> <td style="text-align: right;"> <b>status</b>: </td> <td> New &#8594; Confirmed </td> </tr> </table> </div> </div> <div xmlns="http://www.w3.org/1999/xhtml" itemscope="" itemtype="http://schema.org/UserComments" class="boardComment editable-message " data-baseurl="/openlibrary/+bug/389217/comments/1" data-i-can-edit="False"> <div 
class="boardCommentDetails"> <div class="message-revision-container"> <div class="message-revision-container-header"> <span>Revision history for this message</span> <img src="/+icing/build/overlay/assets/skins/sam/images/close.gif" class="message-revision-close" /> </div> <script type="text/template"> <div class='message-revision-item'> <div class='message-revision-title'> <a class="sprite remove action-icon message-revision-del-btn"> Remove </a> <a class="js-action"> Revision #{revision}, created at {date_created_display} </a> </div> <div class='message-revision-body'>{content}</div> </div> </script> <div class="message-revision-list"></div> </div> <table> <tbody> <tr> <td> <a href="https://launchpad.net/~kcoyle" class="sprite person">Karen Coyle (kcoyle)</a> wrote <time itemprop="commentTime" datetime="2009-06-19T00:18:48.532235+00:00" title="2009-06-19 00:18:48 UTC">on 2009-06-19</time><span class="editable-message-last-edit-date">: </span> </td> <td> </td> <td> </td> <td class="bug-comment-index"> <a itemprop="url" href="/openlibrary/+bug/389217/comments/1"> #1</a> </td> </tr> </tbody> </table> </div> <div class="boardCommentBody"> <div class="editable-message-body"> <div class="comment-text editable-message-text" itemprop="commentText"><p>at least two other tokenizing problems:</p> <p>subject headings with ampersands, e.g. &quot;sports &amp; recreation&quot;, retrieve zero when clicked on. Note that there are headings with ampersands but no surrounding spaces (&quot;Sports&amp;<wbr />Recreation&quot;<wbr />), and these can be retrieved by putting together the two words without the ampersand (&quot;sportsrecreat<wbr />ion&quot;). This latter does not retrieve the ones with spaces around the ampersand.</p> <p>some subject headings with slashes have this same problem, e.g. &quot;Children&#x27;s Books/Ages 4-8 Fiction&quot;. However, others, e.g. &quot;Health/Fitness&quot; work fine. 
The search &quot;healthfitness&quot; retrieves books with &quot;health/fitness&quot;.</p> <p>totally unclear to me how solr tokenizes.</p></div> </div> <div class="editable-message-form" style="display: none"> <textarea style="width: 100%" rows="10">at least two other tokenizing problems: subject headings with ampersands, e.g. "sports &amp; recreation", retrieve zero when clicked on. Note that there are headings with ampersands but no surrounding spaces ("Sports&amp;Recreation"), and these can be retrieved by putting together the two words without the ampersand ("sportsrecreation"). This latter does not retrieve the ones with spaces around the ampersand. some subject headings with slashes have this same problem, e.g. "Children's Books/Ages 4-8 Fiction". However, others, e.g. "Health/Fitness" work fine. The search "healthfitness" retrieves books with "health/fitness". totally unclear to me how solr tokenizes.</textarea> <input type="button" value="Update" class="editable-message-update-btn" /> <input type="button" value="Cancel" class="editable-message-cancel-btn" /> </div> </div> </div> <div xmlns="http://www.w3.org/1999/xhtml" itemscope="" itemtype="http://schema.org/UserComments" class="boardComment editable-message " data-baseurl="/openlibrary/+bug/389217/comments/2" data-i-can-edit="False"> <div class="boardCommentDetails"> <div class="message-revision-container"> <div class="message-revision-container-header"> <span>Revision history for this message</span> <img src="/+icing/build/overlay/assets/skins/sam/images/close.gif" class="message-revision-close" /> </div> <script type="text/template"> <div class='message-revision-item'> <div class='message-revision-title'> <a class="sprite remove action-icon message-revision-del-btn"> Remove </a> <a class="js-action"> Revision #{revision}, created at {date_created_display} </a> </div> <div class='message-revision-body'>{content}</div> </div> </script> <div class="message-revision-list"></div> </div> <table> <tbody> <tr> 
<td> <a href="https://launchpad.net/~solrize" class="sprite person">solrize (solrize)</a> wrote <time itemprop="commentTime" datetime="2009-06-19T01:25:57.181637+00:00" title="2009-06-19 01:25:57 UTC">on 2009-06-19</time><span class="editable-message-last-edit-date">: </span> </td> <td> </td> <td> </td> <td class="bug-comment-index"> <a itemprop="url" href="/openlibrary/+bug/389217/comments/2"> #2</a> </td> </tr> </tbody> </table> </div> <div class="boardCommentBody"> <div class="editable-message-body"> <div class="comment-text editable-message-text" itemprop="commentText"><p>The issue with ampersands in those subject links is unrelated, it&#x27;s caused by thingrepr making unicode with escaped entities, which hash into different facet tokens than the unescaped versions, so the links don&#x27;t find anything. That is discussed in #378841.</p></div> </div> <div class="editable-message-form" style="display: none"> <textarea style="width: 100%" rows="10">The issue with ampersands in those subject links is unrelated, it's caused by thingrepr making unicode with escaped entities, which hash into different facet tokens than the unescaped versions, so the links don't find anything. 
That is discussed in #378841.</textarea> <input type="button" value="Update" class="editable-message-update-btn" /> <input type="button" value="Cancel" class="editable-message-cancel-btn" /> </div> </div> </div> <div xmlns="http://www.w3.org/1999/xhtml" itemscope="" itemtype="http://schema.org/UserComments" class="boardComment editable-message " data-baseurl="/openlibrary/+bug/389217/comments/3" data-i-can-edit="False"> <div class="boardCommentDetails"> <div class="message-revision-container"> <div class="message-revision-container-header"> <span>Revision history for this message</span> <img src="/+icing/build/overlay/assets/skins/sam/images/close.gif" class="message-revision-close" /> </div> <script type="text/template"> <div class='message-revision-item'> <div class='message-revision-title'> <a class="sprite remove action-icon message-revision-del-btn"> Remove </a> <a class="js-action"> Revision #{revision}, created at {date_created_display} </a> </div> <div class='message-revision-body'>{content}</div> </div> </script> <div class="message-revision-list"></div> </div> <table> <tbody> <tr> <td> <a href="https://launchpad.net/~kcoyle" class="sprite person">Karen Coyle (kcoyle)</a> wrote <time itemprop="commentTime" datetime="2009-06-19T16:04:08.096002+00:00" title="2009-06-19 16:04:08 UTC">on 2009-06-19</time><span class="editable-message-last-edit-date">: </span> </td> <td> </td> <td> </td> <td class="bug-comment-index"> <a itemprop="url" href="/openlibrary/+bug/389217/comments/3"> #3</a> </td> </tr> </tbody> </table> </div> <div class="boardCommentBody"> <div class="editable-message-body"> <div class="comment-text editable-message-text" itemprop="commentText"><p>How solr tokenizes: <a rel="nofollow" href="http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters">http://<wbr />wiki.apache.<wbr />org/solr/<wbr />AnalyzersTokeni<wbr />zersTokenFilter<wbr />s</a> -- the part about whitespace is pretty clear, but I&#x27;m still not sure what solr does with slashes 
or ampersands.</p> <p>One of the problems could be if we are using the de-composed Unicode forms, and solr expects pre-composed. So we would have a letter followed by an accent character, rather than have the two combined in a single unicode character. We may want to switch to pre-composed if that is the case.</p> <p>Pre-composed could present some problems for transliterations (which is what we see with the nasa+jainism case) -- sometimes there isn&#x27;t a pre-composed equivalent because the transliterations are artificial. But we&#x27;d probably still get better search results for most cases.</p></div> </div> <div class="editable-message-form" style="display: none"> <textarea style="width: 100%" rows="10">How solr tokenizes: http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters -- the part about whitespace is pretty clear, but I'm still not sure what solr does with slashes or ampersands. One of the problems could be if we are using the de-composed Unicode forms, and solr expects pre-composed. So we would have a letter followed by an accent character, rather than have the two combined in a single unicode character. We may want to switch to pre-composed if that is the case. Pre-composed could present some problems for transliterations (which is what we see with the nasa+jainism case) -- sometimes there isn't a pre-composed equivalent because the transliterations are artificial. But we'd probably still get better search results for most cases. 
</textarea> <input type="button" value="Update" class="editable-message-update-btn" /> <input type="button" value="Cancel" class="editable-message-cancel-btn" /> </div> </div> </div> <div xmlns="http://www.w3.org/1999/xhtml" itemscope="" itemtype="http://schema.org/UserComments" class="boardComment editable-message " data-baseurl="/openlibrary/+bug/389217/comments/4" data-i-can-edit="False"> <div class="boardCommentDetails"> <div class="message-revision-container"> <div class="message-revision-container-header"> <span>Revision history for this message</span> <img src="/+icing/build/overlay/assets/skins/sam/images/close.gif" class="message-revision-close" /> </div> <script type="text/template"> <div class='message-revision-item'> <div class='message-revision-title'> <a class="sprite remove action-icon message-revision-del-btn"> Remove </a> <a class="js-action"> Revision #{revision}, created at {date_created_display} </a> </div> <div class='message-revision-body'>{content}</div> </div> </script> <div class="message-revision-list"></div> </div> <table> <tbody> <tr> <td> <a href="https://launchpad.net/~kcoyle" class="sprite person">Karen Coyle (kcoyle)</a> wrote <time itemprop="commentTime" datetime="2009-06-30T15:11:59.182136+00:00" title="2009-06-30 15:11:59 UTC">on 2009-06-30</time><span class="editable-message-last-edit-date">: </span> </td> <td> </td> <td> </td> <td class="bug-comment-index"> <a itemprop="url" href="/openlibrary/+bug/389217/comments/4"> #4</a> </td> </tr> </tbody> </table> </div> <div class="boardCommentBody"> <div class="editable-message-body"> <div class="comment-text editable-message-text" itemprop="commentText"><p>Note: Edward already has on his list to re-normalize the data to switch to pre-composed unicode, so that will solve the nara + jainism problem.</p></div> </div> <div class="editable-message-form" style="display: none"> <textarea style="width: 100%" rows="10">Note: Edward already has on his list to re-normalize the data to switch to 
pre-composed unicode, so that will solve the nara + jainism problem.</textarea> <input type="button" value="Update" class="editable-message-update-btn" /> <input type="button" value="Cancel" class="editable-message-cancel-btn" /> </div> </div> </div> <div xmlns="http://www.w3.org/1999/xhtml" itemscope="" itemtype="http://schema.org/UserComments" class="boardComment editable-message " data-baseurl="/openlibrary/+bug/389217/comments/5" data-i-can-edit="False"> <div class="boardCommentDetails"> <div class="message-revision-container"> <div class="message-revision-container-header"> <span>Revision history for this message</span> <img src="/+icing/build/overlay/assets/skins/sam/images/close.gif" class="message-revision-close" /> </div> <script type="text/template"> <div class='message-revision-item'> <div class='message-revision-title'> <a class="sprite remove action-icon message-revision-del-btn"> Remove </a> <a class="js-action"> Revision #{revision}, created at {date_created_display} </a> </div> <div class='message-revision-body'>{content}</div> </div> </script> <div class="message-revision-list"></div> </div> <table> <tbody> <tr> <td> <a href="https://launchpad.net/~solrize" class="sprite person">solrize (solrize)</a> wrote <time itemprop="commentTime" datetime="2009-06-30T20:58:13.886208+00:00" title="2009-06-30 20:58:13 UTC">on 2009-06-30</time><span class="editable-message-last-edit-date">: </span> </td> <td> </td> <td> </td> <td class="bug-comment-index"> <a itemprop="url" href="/openlibrary/+bug/389217/comments/5"> #5</a> </td> </tr> </tbody> </table> </div> <div class="boardCommentBody"> <div class="editable-message-body"> <div class="comment-text editable-message-text" itemprop="commentText"><p>Karen, the issue with the ampersands in the subject links is discussed in <a href="/bugs/378841" class="bug-link">bug #378841</a>. 
It has absolutely nothing to do with solr tokenization.</p> <p>I agree that switching normalization will help with these diacriticals and probably with some other issues.</p></div> </div> <div class="editable-message-form" style="display: none"> <textarea style="width: 100%" rows="10">Karen, the issue with the ampersands in the subject links is discussed in bug #378841. It has absolutely nothing to do with solr tokenization. I agree that switching normalization will help with these diacriticals and probably with some other issues.</textarea> <input type="button" value="Update" class="editable-message-update-btn" /> <input type="button" value="Cancel" class="editable-message-cancel-btn" /> </div> </div> </div> <div xmlns="http://www.w3.org/1999/xhtml" itemscope="" itemtype="http://schema.org/UserComments" class="boardComment editable-message " data-baseurl="/openlibrary/+bug/389217/comments/6" data-i-can-edit="False"> <div class="boardCommentDetails"> <div class="message-revision-container"> <div class="message-revision-container-header"> <span>Revision history for this message</span> <img src="/+icing/build/overlay/assets/skins/sam/images/close.gif" class="message-revision-close" /> </div> <script type="text/template"> <div class='message-revision-item'> <div class='message-revision-title'> <a class="sprite remove action-icon message-revision-del-btn"> Remove </a> <a class="js-action"> Revision #{revision}, created at {date_created_display} </a> </div> <div class='message-revision-body'>{content}</div> </div> </script> <div class="message-revision-list"></div> </div> <table> <tbody> <tr> <td> <a href="https://launchpad.net/~george-archive" class="sprite person">George (george-archive)</a> wrote <time itemprop="commentTime" datetime="2010-02-17T23:38:19.220068+00:00" title="2010-02-17 23:38:19 UTC">on 2010-02-17</time><span class="editable-message-last-edit-date">: </span> </td> <td> </td> <td> </td> <td class="bug-comment-index"> <a itemprop="url" 
href="/openlibrary/+bug/389217/comments/6"> #6</a> </td> </tr> </tbody> </table> </div> <div class="boardCommentBody"> <div class="editable-message-body"> <div class="comment-text editable-message-text" itemprop="commentText"><p>Edward - thoughts?</p> <p>If needed, keep open.</p></div> </div> <div class="editable-message-form" style="display: none"> <textarea style="width: 100%" rows="10">Edward - thoughts? If needed, keep open. </textarea> <input type="button" value="Update" class="editable-message-update-btn" /> <input type="button" value="Cancel" class="editable-message-cancel-btn" /> </div> </div> <div class="boardCommentActivity"> <table class="bug-activity"> <tr> <td colspan="2">Changed in openlibrary: </td> </tr> <tr> <td style="text-align: right;"> <b>assignee</b>: </td> <td> solrize (solrize) &#8594; Edward Betts (edwardbetts) </td> </tr> </table> </div> </div> <div style="float: right;"> <a class="menu-link-activitylog" href="https://bugs.launchpad.net/openlibrary/+bug/389217/+activity">See full activity log</a> </div> <div class="clearfix"></div> <div align="center" id="add-comment-login-first"> To post a comment you must <a href="+login?comments=all">log in</a>. </div> </div><!-- class="top-portlet" --> </div><!--- id="maincontentsub"--> <div> <div id="duplicate-form-container"></div> <div id="privacy-form-container"></div> </div> </div> </div><!-- yui-b --> </div><!-- yui-main --> <div id="side-portlets" class="yui-b side"> <div id="involvement" class="portlet"> <ul class="involvement"> <li class="single"> <a class="sprite bugs" href="/openlibrary/+filebug"> Report a bug </a> </li> </ul> </div> <div id="privacy" class="first portlet public"> <div id="privacy-text"> <span id="information-type-summary" class="sprite public">This report contains <strong id="information-type">Public</strong> information </span>&nbsp; <div id="information-type-description" style="padding-top: 5px">Everyone can see this information. 
</div>
</div>
</div>

<!-- Side portlet: bug actions. Both lists are empty in this render. -->
<div id="portlet-actions" class="portlet vertical">
  <ul id="duplicate-actions"> </ul>
  <ul id="lock-status-actions"> </ul>
</div>

<!-- Side portlet: the current user's subscription to this bug. -->
<div class="portlet vertical" id="portlet-subscription">
  <div class="section">
    <div id="current_user_subscription" class="False">
      <span>You are</span>
      <a class="menu-link-subscription sprite modify edit" href="/openlibrary/+bug/389217/+subscribe">
        not directly subscribed to this bug's notifications.
      </a>
    </div>
    <div id="sub-unsub-spinner">Subscribing...</div>
    <ul>
      <li><a class="menu-link-editsubscriptions sprite modify edit" href="https://bugs.launchpad.net/openlibrary/+bug/389217/+subscriptions" title="View and change your subscriptions to this bug">Edit bug mail</a></li>
    </ul>
  </div>
  <!-- Initializes the subscription portlet module once the DOM is ready. -->
  <script type="text/javascript">
    LPJS.use('io-base', 'node', 'lp.bugs.bugtask_index.portlets.subscription', function(Y) {
      Y.on('domready', function() {
        Y.lp.bugs.bugtask_index.portlets.subscription.initialize();
      });
    });
  </script>
</div>

<!-- Side portlet: other subscribers. The #other-bug-subscribers container is rendered empty. -->
<div class="portlet vertical" id="portlet-subscribers">
  <h2>Other bug subscribers</h2>
  <div>
    <div><a class="menu-link-addsubscriber sprite add" href="https://bugs.launchpad.net/openlibrary/+bug/389217/+addsubscriber" title="Launchpad will email that person whenever this bug changes">Subscribe someone else</a></div>
  </div>
  <div id="other-bug-subscribers"></div>
</div>

<!-- Side portlet: remote bug watches. The list is empty in this render. -->
<div class="portlet" id="portlet-watches">
  <h2>Remote bug watches</h2>
  <ul> </ul>
  <p>Bug watches keep track of this bug in other bug trackers.</p>
</div>

</div><!-- yui-b side -->
</div><!-- yui-t4 -->

<!-- Page footer: branding links, global search form, and colophon. -->
<div id="footer" class="footer">
  <div class="lp-arcana">
    <div class="lp-branding">
      <a href="https://launchpad.net/"><img src="/@@/launchpad-footer-logo.svg" alt="Launchpad" width="65" height="18" /></a>
      &nbsp;&bull;&nbsp;
      <a href="https://launchpad.net/+tour">Take the tour</a>
      &nbsp;&bull;&nbsp;
      <a href="https://help.launchpad.net/">Read the guide</a>
      &nbsp;
      <form id="globalsearch" method="get" accept-charset="UTF-8" action="https://launchpad.net/+search">
        <!-- The text field has no visible label, so aria-label supplies its accessible name. -->
        <input type="search" id="search-text" name="field.text" aria-label="Search Launchpad" />
        <input type="image" src="/@@/search" style="vertical-align:5%" alt="Search Launchpad" />
      </form>
    </div>
  </div>
  <div class="colophon">
    &copy; 2004
    <a href="http://canonical.com/">Canonical&nbsp;Ltd.</a>
    &nbsp;&bull;&nbsp;
    <a href="https://launchpad.net/legal">Terms of use</a>
    &nbsp;&bull;&nbsp;
    <a href="https://www.ubuntu.com/legal/dataprivacy">Data privacy</a>
    &nbsp;&bull;&nbsp;
    <a href="/feedback">Contact Launchpad Support</a>
    &nbsp;&bull;&nbsp;
    <a href="http://blog.launchpad.net/">Blog</a>
    &nbsp;&bull;&nbsp;
    <a href="https://canonical.com/careers">Careers</a>
    &nbsp;&bull;&nbsp;
    <a href="https://ubuntu.social/@launchpadstatus">System status</a>
    <span id="lp-version">
      &nbsp;&bull;&nbsp;
      22ade00 (<a href="https://dev.launchpad.net/">Get the code!</a>)
    </span>
  </div>
</div>
</div><!-- yui-d0-->

<!--
  Page data cache assigned to LP.cache, one top-level key per line.
  Every quoted string must stay on a single line: a raw line break inside a
  string literal is a JavaScript syntax error and would prevent the whole
  assignment from running.
-->
<script id="json-cache-script">LP.cache = {
  "related_features": {},
  "bug": {"self_link": "https://bugs.launchpad.net/api/devel/bugs/389217", "web_link": "https://bugs.launchpad.net/bugs/389217", "resource_type_link": "https://bugs.launchpad.net/api/devel/#bug", "id": 389217, "private": false, "information_type": "Public", "name": null, "title": "solr treats diacriticals as word breaks", "description": "Found by Karen:\n\nPerhaps I'd like to find books about astronauts who practice eastern religion:\n\nhttp://openlibrary.org/search?q=nasa+jainism\n\nThis instead finds a bunch of Indian names that contain the letters \"nasa\" starting in the middle of a word, but preceded by an accented letter. Need to check that we're using the right solr input tokenizer. Unicode normalization may also figure into this.", "owner_link": "https://bugs.launchpad.net/api/devel/~solrize", "bug_tasks_collection_link": "https://bugs.launchpad.net/api/devel/bugs/389217/bug_tasks", "duplicate_of_link": null, "date_created": "2009-06-18T21:25:06.636746+00:00", "activity_collection_link": "https://bugs.launchpad.net/api/devel/bugs/389217/activity", "subscriptions_collection_link": "https://bugs.launchpad.net/api/devel/bugs/389217/subscriptions", "date_last_updated": "2010-02-17T23:38:19.633383+00:00", "who_made_private_link": null, "date_made_private": null, "heat": 6, "bug_watches_collection_link": "https://bugs.launchpad.net/api/devel/bugs/389217/bug_watches", "cves_collection_link": "https://bugs.launchpad.net/api/devel/bugs/389217/cves", "vulnerabilities_collection_link": "https://bugs.launchpad.net/api/devel/bugs/389217/vulnerabilities", "duplicates_collection_link": "https://bugs.launchpad.net/api/devel/bugs/389217/duplicates", "attachments_collection_link": "https://bugs.launchpad.net/api/devel/bugs/389217/attachments", "security_related": false, "latest_patch_uploaded": null, "tags": [], "date_last_message": "2009-06-30T20:58:13.886208+00:00", "number_of_duplicates": 0, "message_count": 7, "users_affected_count": 1, "users_unaffected_count": 0, "users_affected_collection_link": "https://bugs.launchpad.net/api/devel/bugs/389217/users_affected", "users_unaffected_collection_link": "https://bugs.launchpad.net/api/devel/bugs/389217/users_unaffected", "users_affected_count_with_dupes": 1, "other_users_affected_count_with_dupes": 1, "users_affected_with_dupes_collection_link": "https://bugs.launchpad.net/api/devel/bugs/389217/users_affected_with_dupes", "messages_collection_link": "https://bugs.launchpad.net/api/devel/bugs/389217/messages", "lock_status": "Unlocked", "lock_reason": null, "linked_branches_collection_link": "https://bugs.launchpad.net/api/devel/bugs/389217/linked_branches", "linked_merge_proposals_collection_link": "https://bugs.launchpad.net/api/devel/bugs/389217/linked_merge_proposals", "http_etag": "\"9711a78342add284fba80b3cc08b0ae1a8e53959-cf00d3e68d4f26114b4cceadce0d3c56d910851e\""},
  "subscribers_portlet_url_data": {"web_link": "https://bugs.launchpad.net/bugs/389217", "self_link": "https://bugs.launchpad.net/api/devel/bugs/389217"},
  "total_comments_and_activity": 10,
  "initial_comment_batch_offset": 41,
  "first visible_recent_comment": -34,
  "bugtask_data": {"456203": {"id": 456203, "row_id": "tasksummary456203", "form_row_id": "task456203", "bugtask_path": "/openlibrary/+bug/389217", "prefix": "openlibrary", "targetname": "Open Library", "bug_title": "solr treats diacriticals as word breaks", "assignee_value": "edwardbetts", "assignee_is_team": false, "assignee_vocabulary": "AllUserTeamsParticipation", "assignee_vocabulary_filters": [], "hide_assignee_team_selection": true, "user_can_unassign": false, "user_can_delete": false, "delete_link": "https://bugs.launchpad.net/openlibrary/+bug/389217/+delete", "target_is_product": true, "status_widget_items": [{"name": "Confirmed", "value": "Confirmed", "description": "Verified by someone other than the reporter.\n", "description_css_class": "choice-description", "style": "", "help": "", "disabled": false, "css_class": "statusCONFIRMED"}], "status_value": "Confirmed", "importance_widget_items": "[]", "importance_value": "Low", "milestone_widget_items": "[]", "milestone_value": null, "user_can_edit_assignee": false, "user_can_edit_milestone": false, "user_can_edit_status": false, "user_can_edit_importance": false}},
  "information_type_data": {"PUBLIC": {"value": "PUBLIC", "description": "Everyone can see this information.\n", "name": "Public", "order": 0, "is_private": false, "description_css_class": "choice-description"}, "PUBLICSECURITY": {"value": "PUBLICSECURITY", "description": "Everyone can see this security related information.\n", "name": "Public Security", "order": 1, "is_private": false, "description_css_class": "choice-description"}, "PRIVATESECURITY": {"value": "PRIVATESECURITY", "description": "Only the security group can see this information.\n ", "name": "Private Security", "order": 2, "is_private": true, "description_css_class": "choice-description"}, "USERDATA": {"value": "USERDATA", "description": "Only shared with users permitted to see private user information.\n", "name": "Private", "order": 3, "is_private": true, "description_css_class": "choice-description"}},
  "bug_is_private": false,
  "context": {"self_link": "https://bugs.launchpad.net/api/devel/openlibrary/+bug/389217", "web_link": "https://bugs.launchpad.net/openlibrary/+bug/389217", "resource_type_link": "https://bugs.launchpad.net/api/devel/#bug_task", "bug_link": "https://bugs.launchpad.net/api/devel/bugs/389217", "milestone_link": null, "status": "Confirmed", "status_explanation": null, "importance": "Low", "importance_explanation": null, "assignee_link": "https://bugs.launchpad.net/api/devel/~edwardbetts", "bug_target_display_name": "Open Library", "bug_target_name": "openlibrary", "bug_watch_link": null, "date_assigned": "2009-06-18T21:25:38.784945+00:00", "date_created": "2009-06-18T21:25:06.636746+00:00", "date_confirmed": "2009-06-18T21:25:38.550176+00:00", "date_incomplete": null, "date_in_progress": null, "date_closed": null, "date_left_new": "2009-06-18T21:25:38.550176+00:00", "date_triaged": null, "date_fix_committed": null, "date_fix_released": null, "date_left_closed": null, "owner_link": "https://bugs.launchpad.net/api/devel/~solrize", "target_link": "https://bugs.launchpad.net/api/devel/openlibrary", "title": "Bug #389217 in Open Library: \"solr treats diacriticals as word breaks\"", "related_tasks_collection_link": "https://bugs.launchpad.net/api/devel/openlibrary/+bug/389217/related_tasks", "is_complete": false, "http_etag": "\"22bb8039e93142b40605e5c1df305d0bc9ba6a11-f9dbcd4c29973832322746cd755a324b5cc368b5\""}
};</script>
</body>
<!--
  Facet name: bugs
  Page type: main_side
  Has global search: True
  Has application tabs: True
  Has side portlets: True

  At least 46 queries/external actions issued in 0.80 seconds

  Features: {'profiling.enabled': None, 'hard_timeout': '9000', 'app.mainsite_only.canonical_url': None, 'js.yui_version': None, 'app.maintenance_message': None, 'bugs.affected_count_includes_dupes.disabled': None, 'baselayout.careers_link.disabled': None, 'visible_render_time': None}

  r22ade00
-->
</html>

Pages: 1 2 3 4 5 6 7 8 9 10