CINXE.COM
Rules based classification - UK Charity Classification
<!doctype html> <html lang="en" class="no-js"> <head> <meta charset="utf-8"> <meta name="viewport" content="width=device-width,initial-scale=1"> <link rel="canonical" href="https://charityclassification.org.uk/method/rules-based-classification/"> <link rel="prev" href="../machine-learning/"> <link rel="next" href="../../data/outputs/"> <link rel="alternate" type="application/rss+xml" title="RSS feed" href="../../feed_rss_created.xml"> <link rel="alternate" type="application/rss+xml" title="RSS feed of updated content" href="../../feed_rss_updated.xml"> <link rel="icon" href="../../assets/images/favicon.png"> <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.31"> <title>Rules based classification - UK Charity Classification</title> <link rel="stylesheet" href="../../assets/stylesheets/main.3cba04c6.min.css"> <link rel="stylesheet" href="../../assets/stylesheets/palette.06af60db.min.css"> <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin> <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback"> <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style> <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script> <link rel="alternate" type="application/rss+xml" title="RSS feed of created content" href="https://charityclassification.org.uk/feed_rss_created.xml"> <link rel="alternate" type="application/rss+xml" title="RSS feed of updated content" href="https://charityclassification.org.uk/feed_rss_updated.xml"> </head> <body dir="ltr" data-md-color-scheme="default" data-md-color-primary="teal" data-md-color-accent="indigo"> <input class="md-toggle" data-md-toggle="drawer" 
type="checkbox" id="__drawer" autocomplete="off"> <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off"> <label class="md-overlay" for="__drawer"></label> <div data-md-component="skip"> <a href="#rules-based-classification" class="md-skip"> Skip to content </a> </div> <div data-md-component="announce"> </div> <header class="md-header" data-md-component="header"> <nav class="md-header__inner md-grid" aria-label="Header"> <a href="../.." title="UK Charity Classification" class="md-header__button md-logo" aria-label="UK Charity Classification" data-md-component="logo"> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M17.63 5.84C17.27 5.33 16.67 5 16 5H5a2 2 0 0 0-2 2v10a2 2 0 0 0 2 2h11c.67 0 1.27-.34 1.63-.85L22 12l-4.37-6.16Z"/></svg> </a> <label class="md-header__button md-icon" for="__drawer"> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg> </label> <div class="md-header__title" data-md-component="header-title"> <div class="md-header__ellipsis"> <div class="md-header__topic"> <span class="md-ellipsis"> UK Charity Classification </span> </div> <div class="md-header__topic" data-md-component="header-topic"> <span class="md-ellipsis"> Rules based classification </span> </div> </div> </div> <form class="md-header__option" data-md-component="palette"> <input class="md-option" data-md-color-media="(prefers-color-scheme: light)" data-md-color-scheme="default" data-md-color-primary="teal" data-md-color-accent="indigo" aria-label="Switch to dark mode" type="radio" name="__palette" id="__palette_0"> <label class="md-header__button md-icon" title="Switch to dark mode" for="__palette_1" hidden> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M17 6H7c-3.31 0-6 2.69-6 6s2.69 6 6 6h10c3.31 0 6-2.69 6-6s-2.69-6-6-6zm0 10H7c-2.21 0-4-1.79-4-4s1.79-4 4-4h10c2.21 0 4 1.79 4 4s-1.79 4-4 4zM7 9c-1.66 0-3 1.34-3 3s1.34 3 3 3 
3-1.34 3-3-1.34-3-3-3z"/></svg> </label> <input class="md-option" data-md-color-media="(prefers-color-scheme: dark)" data-md-color-scheme="slate" data-md-color-primary="teal" data-md-color-accent="indigo" aria-label="Switch to light mode" type="radio" name="__palette" id="__palette_1"> <label class="md-header__button md-icon" title="Switch to light mode" for="__palette_0" hidden> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M17 7H7a5 5 0 0 0-5 5 5 5 0 0 0 5 5h10a5 5 0 0 0 5-5 5 5 0 0 0-5-5m0 8a3 3 0 0 1-3-3 3 3 0 0 1 3-3 3 3 0 0 1 3 3 3 3 0 0 1-3 3Z"/></svg> </label> </form> <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script> <div class="md-header__source"> <a href="https://github.com/charity-classification/ukcat" title="Go to repository" class="md-source" data-md-component="source"> <div class="md-source__icon md-icon"> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! 
Font Awesome Free 6.6.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg> </div> <div class="md-source__repository"> GitHub </div> </a> </div> </nav> </header> <div class="md-container" data-md-component="container"> <nav class="md-tabs" aria-label="Tabs" data-md-component="tabs"> <div class="md-grid"> <ul class="md-tabs__list"> <li class="md-tabs__item"> <a href="../.." class="md-tabs__link"> Classifying UK Charities </a> </li> <li class="md-tabs__item"> <a href="../../blog/2021/10/06/launching-ukcat/" class="md-tabs__link"> Blog </a> </li> <li class="md-tabs__item md-tabs__item--active"> <a href="../introduction/" class="md-tabs__link"> Method </a> </li> <li class="md-tabs__item"> <a href="../../data/outputs/" class="md-tabs__link"> Data </a> </li> <li class="md-tabs__item"> <a href="../../other-work/" class="md-tabs__link"> Other resources </a> </li> <li class="md-tabs__item"> <a href="../../contact/" class="md-tabs__link"> Contact us </a> </li> </ul> </div> </nav> <main class="md-main" data-md-component="main"> <div class="md-main__inner md-grid"> <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" > <div class="md-sidebar__scrollwrap"> <div class="md-sidebar__inner"> <nav class="md-nav md-nav--primary md-nav--lifted" aria-label="Navigation" data-md-level="0"> <label class="md-nav__title" for="__drawer"> <a href="../.." 
title="UK Charity Classification" class="md-nav__button md-logo" aria-label="UK Charity Classification" data-md-component="logo"> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M17.63 5.84C17.27 5.33 16.67 5 16 5H5a2 2 0 0 0-2 2v10a2 2 0 0 0 2 2h11c.67 0 1.27-.34 1.63-.85L22 12l-4.37-6.16Z"/></svg> </a> UK Charity Classification </label> <div class="md-nav__source"> <a href="https://github.com/charity-classification/ukcat" title="Go to repository" class="md-source" data-md-component="source"> <div class="md-source__icon md-icon"> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><!--! Font Awesome Free 6.6.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2024 Fonticons, Inc.--><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg> </div> <div class="md-source__repository"> GitHub </div> </a> </div> <ul class="md-nav__list" data-md-scrollfix> <li class="md-nav__item"> <a href="../.." 
class="md-nav__link"> <span class="md-ellipsis"> Classifying UK Charities </span> </a> </li> <li class="md-nav__item md-nav__item--nested"> <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" > <label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0"> <span class="md-ellipsis"> Blog </span> <span class="md-nav__icon md-icon"></span> </label> <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false"> <label class="md-nav__title" for="__nav_2"> <span class="md-nav__icon md-icon"></span> Blog </label> <ul class="md-nav__list" data-md-scrollfix> <li class="md-nav__item"> <a href="../../blog/2021/10/06/launching-ukcat/" class="md-nav__link"> <span class="md-ellipsis"> Launching UK-CAT </span> </a> </li> <li class="md-nav__item"> <a href="../../blog/2021/03/17/a-uk-charity-classification-system/" class="md-nav__link"> <span class="md-ellipsis"> A UK Charity Classification System </span> </a> </li> <li class="md-nav__item"> <a href="../../blog/2021/01/11/classifying-the-charity-register/" class="md-nav__link"> <span class="md-ellipsis"> Classifying the charity register </span> </a> </li> </ul> </nav> </li> <li class="md-nav__item md-nav__item--active md-nav__item--section md-nav__item--nested"> <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked> <label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex=""> <span class="md-ellipsis"> Method </span> <span class="md-nav__icon md-icon"></span> </label> <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true"> <label class="md-nav__title" for="__nav_3"> <span class="md-nav__icon md-icon"></span> Method </label> <ul class="md-nav__list" data-md-scrollfix> <li class="md-nav__item"> <a href="../introduction/" class="md-nav__link"> <span class="md-ellipsis"> Introduction </span> </a> </li> <li class="md-nav__item"> <a href="../sampling/" class="md-nav__link"> <span class="md-ellipsis"> 
Sampling charities </span> </a> </li> <li class="md-nav__item"> <a href="../designing-taxonomy/" class="md-nav__link"> <span class="md-ellipsis"> Designing the UK-CAT Taxonomy </span> </a> </li> <li class="md-nav__item"> <a href="../manual-classification/" class="md-nav__link"> <span class="md-ellipsis"> Manual classification of a sample of charities </span> </a> </li> <li class="md-nav__item"> <a href="../machine-learning/" class="md-nav__link"> <span class="md-ellipsis"> Machine Learning </span> </a> </li> <li class="md-nav__item md-nav__item--active"> <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc"> <a href="./" class="md-nav__link md-nav__link--active"> <span class="md-ellipsis"> Rules based classification </span> </a> </li> </ul> </nav> </li> <li class="md-nav__item md-nav__item--nested"> <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" > <label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0"> <span class="md-ellipsis"> Data </span> <span class="md-nav__icon md-icon"></span> </label> <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false"> <label class="md-nav__title" for="__nav_4"> <span class="md-nav__icon md-icon"></span> Data </label> <ul class="md-nav__list" data-md-scrollfix> <li class="md-nav__item"> <a href="../../data/outputs/" class="md-nav__link"> <span class="md-ellipsis"> Outputs </span> </a> </li> <li class="md-nav__item"> <a href="../../data/data-downloads/" class="md-nav__link"> <span class="md-ellipsis"> Data downloads </span> </a> </li> <li class="md-nav__item"> <a href="../../data/tag_list/" class="md-nav__link"> <span class="md-ellipsis"> UK-CAT </span> </a> </li> </ul> </nav> </li> <li class="md-nav__item"> <a href="../../other-work/" class="md-nav__link"> <span class="md-ellipsis"> Other resources </span> </a> </li> <li class="md-nav__item"> <a href="../../contact/" class="md-nav__link"> <span class="md-ellipsis"> Contact us </span> </a> </li> 
</ul> </nav> </div> </div> </div> <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" > <div class="md-sidebar__scrollwrap"> <div class="md-sidebar__inner"> <nav class="md-nav md-nav--secondary" aria-label="Table of contents"> </nav> </div> </div> </div> <div class="md-content" data-md-component="content"> <article class="md-content__inner md-typeset"> <a href="https://github.com/charity-classification/ukcat/edit/master/docs/method/rules-based-classification.md" title="Edit this page" class="md-content__button md-icon"> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20.71 7.04c.39-.39.39-1.04 0-1.41l-2.34-2.34c-.37-.39-1.02-.39-1.41 0l-1.84 1.83 3.75 3.75M3 17.25V21h3.75L17.81 9.93l-3.75-3.75L3 17.25Z"/></svg> </a> <h1 id="rules-based-classification">Rules based classification<a class="headerlink" href="#rules-based-classification" title="Permanent link">¶</a></h1> <p>The manually classified entries provided a pool of baseline data from which to start developing the automated keyword searching. </p> <p>To assist with the creation of these keywords, we took all of the charities from the manual sample linked to each tag and ran frequencies on the most common words and pairs of words (bigrams). This provided an initial indication of the most common associated terms and allowed us to create a ‘regular expression’ of search terms. </p> <p>Secondly, using an online tool created for this purpose, we worked through each tag examining the ‘false negatives’. These were charities we had linked manually to a tag, but which were not yet being caught by our search terms. Examining the activities of these charities often revealed modifications or additions that needed to be made to the search terms. </p> <p>At the same time, we kept a close eye on those charities that were being included by the search terms, particularly those from the sample which we hadn’t manually linked. 
In many cases, these were entirely reasonable and had either been missed during the manual classification, or were just slightly wider in scope than the manual coders had operated. </p> <p>Unsurprisingly, the eventual search terms chosen for each tag were a balance between avoiding false positives and avoiding false negatives, which had to be struck fairly intuitively by the research team.</p> <p>The method for refining the tags allowed us to produce two measures of success: precision and recall, as well as the f1 score, which combines the two. Each measure was scored between 0 and 1. In our case, these measures are defined as follows:</p> <ul> <li><strong>Precision</strong> shows how many of the charities selected by the tag keyword were correct. This is equivalent to true positives as a proportion of all selected elements. A high precision score shows that the keyword was good at minimising false positives - a high proportion of those selected were correct. A low precision score means that the keyword selected lots of charities that shouldn’t have been (false positives).</li> <li><strong>Recall</strong> shows how many of the charities that should be selected for this tag actually were selected. This is equivalent to true positives as a proportion of all relevant elements. A high recall score means that the keyword did well at finding a large proportion of the relevant charities. A low recall score means that the keyword found a small proportion of the relevant charities - lots of false negatives.</li> <li>The <strong>F1 Score</strong> combines these, using the harmonic mean. It generally reflects the lower of the two figures.</li> </ul> <p>Figure X shows the distribution of tag results in bands using these three measures. It demonstrates that the current tags are better at maximising recall, and less so with precision. This means we would expect to see more false positives in the result, but fewer false negatives. 
This is reflected in the results when the tags are applied more widely, with a larger number of tags per charity than in the sample data.</p> <p>Figure X: Distribution of tags across different measures of quality of the results</p> <div class="flourish-embed flourish-chart" data-src="visualisation/6706687"><script src="https://public.flourish.studio/resources/embed.js"></script></div> <p>To complicate matters further, we continued to refine the UK-CAT throughout this process. Although it would have been a neater, more linear method to finish it first, we found that often the process of coming up with the key words made it logical to tweak the tags themselves. Sometimes this was to avoid duplication, clarify the difference between two tags, or in one or two cases because creating suitable search terms was simply not possible due to the inherent ambiguity that sometimes occurs in the English language. </p> <aside class="md-source-file"> <span class="md-source-file__fact"> <span class="md-icon" title="Last update"> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M21 13.1c-.1 0-.3.1-.4.2l-1 1 2.1 2.1 1-1c.2-.2.2-.6 0-.8l-1.3-1.3c-.1-.1-.2-.2-.4-.2m-1.9 1.8-6.1 6V23h2.1l6.1-6.1-2.1-2M12.5 7v5.2l4 2.4-1 1L11 13V7h1.5M11 21.9c-5.1-.5-9-4.8-9-9.9C2 6.5 6.5 2 12 2c5.3 0 9.6 4.1 10 9.3-.3-.1-.6-.2-1-.2s-.7.1-1 .2C19.6 7.2 16.2 4 12 4c-4.4 0-8 3.6-8 8 0 4.1 3.1 7.5 7.1 7.9l-.1.2v1.8Z"/></svg> </span> <span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date">August 7, 2024</span> </span> <span class="md-source-file__fact"> <span class="md-icon" title="Created"> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M14.47 15.08 11 13V7h1.5v5.25l3.08 1.83c-.41.28-.79.62-1.11 1m-1.39 4.84c-.36.05-.71.08-1.08.08-4.42 0-8-3.58-8-8s3.58-8 8-8 8 3.58 8 8c0 .37-.03.72-.08 1.08.69.1 1.33.32 1.92.64.1-.56.16-1.13.16-1.72 0-5.5-4.5-10-10-10S2 6.5 2 12s4.47 10 10 10c.59 0 1.16-.06 1.72-.16-.32-.59-.54-1.23-.64-1.92M18 
15v3h-3v2h3v3h2v-3h3v-2h-3v-3h-2Z"/></svg> </span> <span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date">August 7, 2024</span> </span> </aside> </article> </div> <script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script> </div> </main> <footer class="md-footer"> <div class="md-footer-meta md-typeset"> <div class="md-footer-meta__inner md-grid"> <div class="md-copyright"> Made with <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener"> Material for MkDocs </a> </div> </div> </div> </footer> </div> <div class="md-dialog" data-md-component="dialog"> <div class="md-dialog__inner md-typeset"></div> </div> <script id="__config" type="application/json">{"base": "../..", "features": ["navigation.tabs"], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script> <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script> <script async defer src="https://scripts.simpleanalyticscdn.com/latest.js"></script> <noscript><img src="https://queue.simpleanalyticscdn.com/noscript.gif" alt="" referrerpolicy="no-referrer-when-downgrade" /></noscript> </body> </html>