Machine Learning - UK Charity Classification

<!doctype html> <html lang="en" class="no-js"> <head> <meta charset="utf-8"> <meta name="viewport" content="width=device-width,initial-scale=1"> <link rel="canonical" href="https://charityclassification.org.uk/method/machine-learning/"> <link rel="prev" href="../manual-classification/"> <link rel="next" href="../rules-based-classification/"> <link rel="alternate" type="application/rss+xml" title="RSS feed" href="../../feed_rss_created.xml"> <link rel="alternate" type="application/rss+xml" title="RSS feed of updated content" href="../../feed_rss_updated.xml"> <link rel="icon" href="../../assets/images/favicon.png"> <meta name="generator" content="mkdocs-1.6.0, mkdocs-material-9.5.31"> <title>Machine Learning - UK Charity Classification</title> <link rel="stylesheet" href="../../assets/stylesheets/main.3cba04c6.min.css"> <link rel="stylesheet" href="../../assets/stylesheets/palette.06af60db.min.css"> <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin> <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback"> <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style> <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script> <link rel="alternate" type="application/rss+xml" title="RSS feed of created content" href="https://charityclassification.org.uk/feed_rss_created.xml"> <link rel="alternate" type="application/rss+xml" title="RSS feed of updated content" href="https://charityclassification.org.uk/feed_rss_updated.xml"> </head> <body dir="ltr" data-md-color-scheme="default" data-md-color-primary="teal" data-md-color-accent="indigo"> <input class="md-toggle" data-md-toggle="drawer" type="checkbox" 
id="__drawer" autocomplete="off"> <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off"> <label class="md-overlay" for="__drawer"></label> <div data-md-component="skip"> <a href="#machine-learning" class="md-skip"> Skip to content </a> </div> <div data-md-component="announce"> </div> <header class="md-header" data-md-component="header"> <nav class="md-header__inner md-grid" aria-label="Header"> <a href="../.." title="UK Charity Classification" class="md-header__button md-logo" aria-label="UK Charity Classification" data-md-component="logo"> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M17.63 5.84C17.27 5.33 16.67 5 16 5H5a2 2 0 0 0-2 2v10a2 2 0 0 0 2 2h11c.67 0 1.27-.34 1.63-.85L22 12l-4.37-6.16Z"/></svg> </a> <label class="md-header__button md-icon" for="__drawer"> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3V6m0 5h18v2H3v-2m0 5h18v2H3v-2Z"/></svg> </label> <div class="md-header__title" data-md-component="header-title"> <div class="md-header__ellipsis"> <div class="md-header__topic"> <span class="md-ellipsis"> UK Charity Classification </span> </div> <div class="md-header__topic" data-md-component="header-topic"> <span class="md-ellipsis"> Machine Learning </span> </div> </div> </div> <form class="md-header__option" data-md-component="palette"> <input class="md-option" data-md-color-media="(prefers-color-scheme: light)" data-md-color-scheme="default" data-md-color-primary="teal" data-md-color-accent="indigo" aria-label="Switch to dark mode" type="radio" name="__palette" id="__palette_0"> <label class="md-header__button md-icon" title="Switch to dark mode" for="__palette_1" hidden> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M17 6H7c-3.31 0-6 2.69-6 6s2.69 6 6 6h10c3.31 0 6-2.69 6-6s-2.69-6-6-6zm0 10H7c-2.21 0-4-1.79-4-4s1.79-4 4-4h10c2.21 0 4 1.79 4 4s-1.79 4-4 4zM7 9c-1.66 0-3 1.34-3 3s1.34 3 3 3 3-1.34 3-3-1.34-3-3-3z"/></svg> </label> 
<input class="md-option" data-md-color-media="(prefers-color-scheme: dark)" data-md-color-scheme="slate" data-md-color-primary="teal" data-md-color-accent="indigo" aria-label="Switch to light mode" type="radio" name="__palette" id="__palette_1"> <label class="md-header__button md-icon" title="Switch to light mode" for="__palette_0" hidden> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M17 7H7a5 5 0 0 0-5 5 5 5 0 0 0 5 5h10a5 5 0 0 0 5-5 5 5 0 0 0-5-5m0 8a3 3 0 0 1-3-3 3 3 0 0 1 3-3 3 3 0 0 1 3 3 3 3 0 0 1-3 3Z"/></svg> </label> </form> <script>var media,input,key,value,palette=__md_get("__palette");if(palette&&palette.color){"(prefers-color-scheme)"===palette.color.media&&(media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']"),palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent"));for([key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script> <div class="md-header__source"> <a href="https://github.com/charity-classification/ukcat" title="Go to repository" class="md-source" data-md-component="source"> <div class="md-source__icon md-icon"> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 
0-40.81z"/></svg> </div> <div class="md-source__repository"> GitHub </div> </a> </div> </nav> </header> <div class="md-container" data-md-component="container"> <nav class="md-tabs" aria-label="Tabs" data-md-component="tabs"> <div class="md-grid"> <ul class="md-tabs__list"> <li class="md-tabs__item"> <a href="../.." class="md-tabs__link"> Classifying UK Charities </a> </li> <li class="md-tabs__item"> <a href="../../blog/2021/10/06/launching-ukcat/" class="md-tabs__link"> Blog </a> </li> <li class="md-tabs__item md-tabs__item--active"> <a href="../introduction/" class="md-tabs__link"> Method </a> </li> <li class="md-tabs__item"> <a href="../../data/outputs/" class="md-tabs__link"> Data </a> </li> <li class="md-tabs__item"> <a href="../../other-work/" class="md-tabs__link"> Other resources </a> </li> <li class="md-tabs__item"> <a href="../../contact/" class="md-tabs__link"> Contact us </a> </li> </ul> </div> </nav> <main class="md-main" data-md-component="main"> <div class="md-main__inner md-grid"> <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" > <div class="md-sidebar__scrollwrap"> <div class="md-sidebar__inner"> <nav class="md-nav md-nav--primary md-nav--lifted" aria-label="Navigation" data-md-level="0"> <label class="md-nav__title" for="__drawer"> <a href="../.." 
title="UK Charity Classification" class="md-nav__button md-logo" aria-label="UK Charity Classification" data-md-component="logo"> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M17.63 5.84C17.27 5.33 16.67 5 16 5H5a2 2 0 0 0-2 2v10a2 2 0 0 0 2 2h11c.67 0 1.27-.34 1.63-.85L22 12l-4.37-6.16Z"/></svg> </a> UK Charity Classification </label> <div class="md-nav__source"> <a href="https://github.com/charity-classification/ukcat" title="Go to repository" class="md-source" data-md-component="source"> <div class="md-source__icon md-icon"> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><path d="M439.55 236.05 244 40.45a28.87 28.87 0 0 0-40.81 0l-40.66 40.63 51.52 51.52c27.06-9.14 52.68 16.77 43.39 43.68l49.66 49.66c34.23-11.8 61.18 31 35.47 56.69-26.49 26.49-70.21-2.87-56-37.34L240.22 199v121.85c25.3 12.54 22.26 41.85 9.08 55a34.34 34.34 0 0 1-48.55 0c-17.57-17.6-11.07-46.91 11.25-56v-123c-20.8-8.51-24.6-30.74-18.64-45L142.57 101 8.45 235.14a28.86 28.86 0 0 0 0 40.81l195.61 195.6a28.86 28.86 0 0 0 40.8 0l194.69-194.69a28.86 28.86 0 0 0 0-40.81z"/></svg> </div> <div class="md-source__repository"> GitHub </div> </a> </div> <ul class="md-nav__list" data-md-scrollfix> <li class="md-nav__item"> <a href="../.." 
class="md-nav__link"> <span class="md-ellipsis"> Classifying UK Charities </span> </a> </li> <li class="md-nav__item md-nav__item--nested"> <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" > <label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0"> <span class="md-ellipsis"> Blog </span> <span class="md-nav__icon md-icon"></span> </label> <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false"> <label class="md-nav__title" for="__nav_2"> <span class="md-nav__icon md-icon"></span> Blog </label> <ul class="md-nav__list" data-md-scrollfix> <li class="md-nav__item"> <a href="../../blog/2021/10/06/launching-ukcat/" class="md-nav__link"> <span class="md-ellipsis"> Launching UK-CAT </span> </a> </li> <li class="md-nav__item"> <a href="../../blog/2021/03/17/a-uk-charity-classification-system/" class="md-nav__link"> <span class="md-ellipsis"> A UK Charity Classification System </span> </a> </li> <li class="md-nav__item"> <a href="../../blog/2021/01/11/classifying-the-charity-register/" class="md-nav__link"> <span class="md-ellipsis"> Classifying the charity register </span> </a> </li> </ul> </nav> </li> <li class="md-nav__item md-nav__item--active md-nav__item--section md-nav__item--nested"> <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked> <label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex=""> <span class="md-ellipsis"> Method </span> <span class="md-nav__icon md-icon"></span> </label> <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true"> <label class="md-nav__title" for="__nav_3"> <span class="md-nav__icon md-icon"></span> Method </label> <ul class="md-nav__list" data-md-scrollfix> <li class="md-nav__item"> <a href="../introduction/" class="md-nav__link"> <span class="md-ellipsis"> Introduction </span> </a> </li> <li class="md-nav__item"> <a href="../sampling/" class="md-nav__link"> <span class="md-ellipsis"> 
Sampling charities </span> </a> </li> <li class="md-nav__item"> <a href="../designing-taxonomy/" class="md-nav__link"> <span class="md-ellipsis"> Designing the UK-CAT Taxonomy </span> </a> </li> <li class="md-nav__item"> <a href="../manual-classification/" class="md-nav__link"> <span class="md-ellipsis"> Manual classification of a sample of charities </span> </a> </li> <li class="md-nav__item md-nav__item--active"> <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc"> <label class="md-nav__link md-nav__link--active" for="__toc"> <span class="md-ellipsis"> Machine Learning </span> <span class="md-nav__icon md-icon"></span> </label> <a href="./" class="md-nav__link md-nav__link--active"> <span class="md-ellipsis"> Machine Learning </span> </a> <nav class="md-nav md-nav--secondary" aria-label="Table of contents"> <label class="md-nav__title" for="__toc"> <span class="md-nav__icon md-icon"></span> Table of contents </label> <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix> <li class="md-nav__item"> <a href="#model-training-results" class="md-nav__link"> <span class="md-ellipsis"> Model training results </span> </a> </li> <li class="md-nav__item"> <a href="#manual-check" class="md-nav__link"> <span class="md-ellipsis"> Manual check </span> </a> </li> <li class="md-nav__item"> <a href="#refinements-to-models" class="md-nav__link"> <span class="md-ellipsis"> Refinements to models </span> </a> <nav class="md-nav" aria-label="Refinements to models"> <ul class="md-nav__list"> <li class="md-nav__item"> <a href="#refinement-1-combine-with-tag-classification" class="md-nav__link"> <span class="md-ellipsis"> Refinement 1: combine with tag classification </span> </a> </li> <li class="md-nav__item"> <a href="#refinement-2-classify-by-group-first" class="md-nav__link"> <span class="md-ellipsis"> Refinement 2: classify by group first </span> </a> </li> <li class="md-nav__item"> <a href="#potential-further-refinements" class="md-nav__link"> <span 
class="md-ellipsis"> Potential further refinements </span> </a> </li> </ul> </nav> </li> </ul> </nav> </li> <li class="md-nav__item"> <a href="../rules-based-classification/" class="md-nav__link"> <span class="md-ellipsis"> Rules based classification </span> </a> </li> </ul> </nav> </li> <li class="md-nav__item md-nav__item--nested"> <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" > <label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0"> <span class="md-ellipsis"> Data </span> <span class="md-nav__icon md-icon"></span> </label> <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false"> <label class="md-nav__title" for="__nav_4"> <span class="md-nav__icon md-icon"></span> Data </label> <ul class="md-nav__list" data-md-scrollfix> <li class="md-nav__item"> <a href="../../data/outputs/" class="md-nav__link"> <span class="md-ellipsis"> Outputs </span> </a> </li> <li class="md-nav__item"> <a href="../../data/data-downloads/" class="md-nav__link"> <span class="md-ellipsis"> Data downloads </span> </a> </li> <li class="md-nav__item"> <a href="../../data/tag_list/" class="md-nav__link"> <span class="md-ellipsis"> UK-CAT </span> </a> </li> </ul> </nav> </li> <li class="md-nav__item"> <a href="../../other-work/" class="md-nav__link"> <span class="md-ellipsis"> Other resources </span> </a> </li> <li class="md-nav__item"> <a href="../../contact/" class="md-nav__link"> <span class="md-ellipsis"> Contact us </span> </a> </li> </ul> </nav> </div> </div> </div> <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" > <div class="md-sidebar__scrollwrap"> <div class="md-sidebar__inner"> <nav class="md-nav md-nav--secondary" aria-label="Table of contents"> <label class="md-nav__title" for="__toc"> <span class="md-nav__icon md-icon"></span> Table of contents </label> <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix> <li class="md-nav__item"> <a 
href="#model-training-results" class="md-nav__link"> <span class="md-ellipsis"> Model training results </span> </a> </li> <li class="md-nav__item"> <a href="#manual-check" class="md-nav__link"> <span class="md-ellipsis"> Manual check </span> </a> </li> <li class="md-nav__item"> <a href="#refinements-to-models" class="md-nav__link"> <span class="md-ellipsis"> Refinements to models </span> </a> <nav class="md-nav" aria-label="Refinements to models"> <ul class="md-nav__list"> <li class="md-nav__item"> <a href="#refinement-1-combine-with-tag-classification" class="md-nav__link"> <span class="md-ellipsis"> Refinement 1: combine with tag classification </span> </a> </li> <li class="md-nav__item"> <a href="#refinement-2-classify-by-group-first" class="md-nav__link"> <span class="md-ellipsis"> Refinement 2: classify by group first </span> </a> </li> <li class="md-nav__item"> <a href="#potential-further-refinements" class="md-nav__link"> <span class="md-ellipsis"> Potential further refinements </span> </a> </li> </ul> </nav> </li> </ul> </nav> </div> </div> </div> <div class="md-content" data-md-component="content"> <article class="md-content__inner md-typeset"> <a href="https://github.com/charity-classification/ukcat/edit/master/docs/method/machine-learning.md" title="edit.link.title" class="md-content__button md-icon"> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M20.71 7.04c.39-.39.39-1.04 0-1.41l-2.34-2.34c-.37-.39-1.02-.39-1.41 0l-1.84 1.83 3.75 3.75M3 17.25V21h3.75L17.81 9.93l-3.75-3.75L3 17.25Z"/></svg> </a> <h1 id="machine-learning">Machine Learning<a class="headerlink" href="#machine-learning" title="Permanent link">¶</a></h1> <p>To classify charities into ICNP/TSO categories, the keyword/regular expression process used for UK-CAT would not work. 
This is because the keywords could indicate multiple ICNP/TSO categories for a given charity, and we need to choose just one for each charity.</p> <p>Instead, a machine learning approach was preferred. In this approach a set of training data (the names and activities description for our sample set of charities) is fed into a machine learning model along with the assigned ICNP/TSO categories. You can then provide the model with a name and activities description for an unknown charity and it will predict the category that best applies (we can hold back part of the training dataset in order to test the accuracy of the model). This problem is generally known as text classification.</p> <p>The approach to this task involved using various models provided by the <a href="https://scikit-learn.org/stable/">scikit-learn python package</a>. We selected eight different models that were suggested as potentially appropriate for performing text classification through machine learning. The nature of scikit-learn means it is possible to easily produce a set of pipelines that put the same data through each of the models and compare the results.</p> <p>The pipeline for training the models consists of the following:</p> <ul> <li>Combine the name and activities description fields for each organisation into a single field</li> <li>Normalise the text in these fields. This includes removing special characters and putting the text into all lowercase.</li> <li>Remove "stopwords" from the text. Stopwords are common words used in English like "and", "a", "for", etc that are very common. A number of charity-specific stopwords were also removed, eg "trust", "fund", "charitable", "charity".</li> <li>Lemmatise all the words in the text. 
Lemmatisation converts words to a common root - so for example "better" and "good" are treated as the same.</li> </ul> <p>For best results, this pipeline needs to also be applied to any text before the model can offer a prediction.</p> <p>The sample data was split into "training" and "test" datasets. The sample dataset consisted of charities from both the manually classified random sample and the top 2000 charities that were manually classified. In total there were 6,203 charities, of which 4,962 (80%) were in the training set and 1,241 (20%) were in the test set.</p> <h2 id="model-training-results">Model training results<a class="headerlink" href="#model-training-results" title="Permanent link">¶</a></h2> <p>Each model was then trained on the training data and used to predict categories for the test data. The accuracy of the model can then be computed. The accuracy of the model is the proportion of categories in the predicted values for the test data that were the same as the actual result from manual classification. 
The results across the eight models were as follows:</p> <table> <thead> <tr> <th>Model</th> <th style="text-align: right;">Accuracy</th> </tr> </thead> <tbody> <tr> <td>Linear Support Vector Classification</td> <td style="text-align: right;">56.7%</td> </tr> <tr> <td>Logistic Regression</td> <td style="text-align: right;">55.8%</td> </tr> <tr> <td>Linear model - Stochastic gradient descent</td> <td style="text-align: right;">54.6%</td> </tr> <tr> <td>Support Vector Classification</td> <td style="text-align: right;">51.7%</td> </tr> <tr> <td>Naive Bayesian</td> <td style="text-align: right;">37.1%</td> </tr> <tr> <td>Decision Tree</td> <td style="text-align: right;">22.3%</td> </tr> <tr> <td>Ada Boost</td> <td style="text-align: right;">14.2%</td> </tr> <tr> <td>Random Forest</td> <td style="text-align: right;">9.7%</td> </tr> </tbody> </table> <p>The result was a near tie between Linear Support Vector Classification, Logistic Regression and Linear model - Stochastic gradient descent, with accuracy of 57%, 56% and 55% respectively. Although it came a close second, Logistic Regression was chosen as the most appropriate model to take forward, primarily because it is possible to extract the probability assigned by the model to the results, which allows for further investigation of them.</p> <p>Accuracy of 56% does mean that the model gets an incorrect result around 9 times out of 20. However, this result is more impressive when compared to the fact that there are over 76 different categories for the model to choose from. A model that simply randomly assigned a category to each charity would have an accuracy of around 1%.</p> <p>And while the final model had an accuracy of 56% for the lowest-level of ICNP/TSO classification, it was correct for the "group" of the ICNP/TSO category 70% of the time. 
This means that, for example, a charity may have been correctly identified as in the "Education" group (group B) but the exact sub-category may not have been right.</p> <p>The probability scores given for the best match found by the model do give some insight into its confidence, although they are not especially helpful. 60% of the test results were given a confidence of 0.99 or higher. Out of those 72% were correct and 28% were incorrect - a higher accuracy than the model overall. The accuracy for matches with scores between 0.75-0.99 was around 35%, and for lower than 0.75 was around 25%. This does show that the confidence scores do reflect the accuracy of the results, but even for low confidence scores they are correct around one third of the time.</p> <h2 id="manual-check">Manual check<a class="headerlink" href="#manual-check" title="Permanent link">¶</a></h2> <p>To further check the results of the machine learning model, a random sample of 300 charities was taken from the full results. This consisted of a weighted sample, based on taking 20 charities across 5 income bands, in each of the 3 regulatory jurisdictions. The sample included some manually classified results - these were ignored in the results shown below, with 236 of the results from the machine learning model. Each result was then checked and put into one of three categories:</p> <ul> <li>"Correct" - the ICNP/TSO category assigned by the model was correct</li> <li>"Plausible" - the ICNP/TSO category assigned was plausible, but a human looking at the charity might have chosen a different one</li> <li>"Incorrect" - the ICNP/TSO category assigned did not look correct</li> </ul> <p>The exercise produced a better result than found in the formal machine learning test - 85% of the matches overall were "correct", with a further 5% "plausible", leaving 11% that could be considered "incorrect". The results varied by regulator and by income band. 
CCEW had the lowest "correct" score, with 79%, followed by OSCR (82% correct) and CCNI (91%). </p> <p>The score did not appear to vary across income band for OSCR and CCNI, but for the CCEW results there was a pattern of smaller organisations being more likely to be incorrect - with 65% correct for the smallest band (under £10k, for CCEW) compared to 89% for £1m-£10m.</p> <p>These results are encouraging, and suggest that the results are better than the formal machine learning test would suggest. If the "plausible" results are also accepted, the overall success rate from this exercise would be nearly 90%. </p> <h2 id="refinements-to-models">Refinements to models<a class="headerlink" href="#refinements-to-models" title="Permanent link">¶</a></h2> <p>There are ways that could improve the performance of the models for producing the correct ICNP/TSO results. Two of these methods have been tried (and offer no significant improvement over the base model), while the others could be tried in future research.</p> <h3 id="refinement-1-combine-with-tag-classification">Refinement 1: combine with tag classification<a class="headerlink" href="#refinement-1-combine-with-tag-classification" title="Permanent link">¶</a></h3> <p>This refinement involves using the keywords developed for the UK-CAT classification. As each UK-CAT tag has one or more "related" ICNP/TSO categories, the method for this refinement was to first run the UK-CAT keywords against the charity text data (the name and activities). The unique set of related ICNP/TSO categories found through these keywords could then be used to narrow down the set of allowed ICNP/TSO categories from the machine learning model. The best result from the related ICNP/TSO categories would be selected, using the probability from the machine learning model.</p> <p>This method produced an accuracy of 52%, slightly less than the base model. In around 14% of cases only one related ICNP/TSO category was found, which was chosen by default. 
And in a small number of cases (<5%) no related categories were found. There was no difference in the accuracy of this technique across the number of different tags found - it performed worse than the base model no matter how many related tags were found.</p> <p>The results of this refinement can be found in the <code>icnptso-ml-tag-test.ipynb</code> notebook.</p> <h3 id="refinement-2-classify-by-group-first">Refinement 2: classify by group first<a class="headerlink" href="#refinement-2-classify-by-group-first" title="Permanent link">¶</a></h3> <p>A second potential refinement was to split the classification problem into two stages. The first stage would classify charities into ICNP/TSO groups (e.g. Education), with the second stage then deciding on the subcategory within the group. This would involve one model to predict the group classification, then a series of models for the subcategories, one for each group. </p> <p>Applying this refinement did not produce any improvement in the accuracy generated. The accuracy of the group classification model was 69%, around the same as the accuracy at a group level for the classification as a whole. And once the individual models were run the overall accuracy of the process was 53%, slightly less than the base model.</p> <h3 id="potential-further-refinements">Potential further refinements<a class="headerlink" href="#potential-further-refinements" title="Permanent link">¶</a></h3> <p>The two refinements tested did not produce any improvement in the accuracy of the result. There are other areas that could be tried though.</p> <p>The first is parameter optimization. Currently the logistic regression model uses some default parameters chosen from the documentation. It is possible to tweak the parameters that the model uses to improve performance. 
This tweaking can be done manually, or by exploring a parameter space and optimising automatically based on results.</p> <p>The second potential improvement would be to use another machine learning technique to get better results. An inspiration in this area might be Ma (2021), in which the author uses a number of advanced machine learning techniques in a similar space and produces improved results. In particular, the use of the BERT set of pre-trained word embeddings could help bring different words with similar meanings closer together within the model.</p> <p>Finally, a larger sample dataset should result in increased accuracy, even for the existing model. There may be innovative ways of producing this increased sample - for example by "gamifying" the process of confirming potential tags for a charity using an interactive online tool. </p> <aside class="md-source-file"> <span class="md-source-file__fact"> <span class="md-icon" title="Last update"> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M21 13.1c-.1 0-.3.1-.4.2l-1 1 2.1 2.1 1-1c.2-.2.2-.6 0-.8l-1.3-1.3c-.1-.1-.2-.2-.4-.2m-1.9 1.8-6.1 6V23h2.1l6.1-6.1-2.1-2M12.5 7v5.2l4 2.4-1 1L11 13V7h1.5M11 21.9c-5.1-.5-9-4.8-9-9.9C2 6.5 6.5 2 12 2c5.3 0 9.6 4.1 10 9.3-.3-.1-.6-.2-1-.2s-.7.1-1 .2C19.6 7.2 16.2 4 12 4c-4.4 0-8 3.6-8 8 0 4.1 3.1 7.5 7.1 7.9l-.1.2v1.8Z"/></svg> </span> <span class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date">August 7, 2024</span> </span> <span class="md-source-file__fact"> <span class="md-icon" title="Created"> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M14.47 15.08 11 13V7h1.5v5.25l3.08 1.83c-.41.28-.79.62-1.11 1m-1.39 4.84c-.36.05-.71.08-1.08.08-4.42 0-8-3.58-8-8s3.58-8 8-8 8 3.58 8 8c0 .37-.03.72-.08 1.08.69.1 1.33.32 1.92.64.1-.56.16-1.13.16-1.72 0-5.5-4.5-10-10-10S2 6.5 2 12s4.47 10 10 10c.59 0 1.16-.06 1.72-.16-.32-.59-.54-1.23-.64-1.92M18 15v3h-3v2h3v3h2v-3h3v-2h-3v-3h-2Z"/></svg> </span> <span 
class="git-revision-date-localized-plugin git-revision-date-localized-plugin-date">August 7, 2024</span> </span> </aside> </article> </div> <script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script> </div> </main> <footer class="md-footer"> <div class="md-footer-meta md-typeset"> <div class="md-footer-meta__inner md-grid"> <div class="md-copyright"> Made with <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener"> Material for MkDocs </a> </div> </div> </div> </footer> </div> <div class="md-dialog" data-md-component="dialog"> <div class="md-dialog__inner md-typeset"></div> </div> <script id="__config" type="application/json">{"base": "../..", "features": ["navigation.tabs"], "search": "../../assets/javascripts/workers/search.b8dbb3d2.min.js", "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}}</script> <script src="../../assets/javascripts/bundle.fe8b6f2b.min.js"></script> <script async defer src="https://scripts.simpleanalyticscdn.com/latest.js"></script> <noscript><img src="https://queue.simpleanalyticscdn.com/noscript.gif" alt="" referrerpolicy="no-referrer-when-downgrade" /></noscript> </body> </html>

CINXE.COM

Machine Learning - UK Charity Classification