CINXE.COM
Project ideas – Ensembl Blog
<!DOCTYPE html> <html lang="en-GB"> <head> <meta charset="UTF-8" /> <meta http-equiv="X-UA-Compatible" content="IE=edge"> <meta name="viewport" content="width=device-width, initial-scale=1.0" /> <link rel="profile" href="http://gmpg.org/xfn/11" /> <link rel="pingback" href="https://www.ensembl.info/xmlrpc.php" /> <title>Project ideas – Ensembl Blog</title> <meta name='robots' content='max-image-preview:large' /> <link rel='dns-prefetch' href='//secure.gravatar.com' /> <link rel='dns-prefetch' href='//stats.wp.com' /> <link rel='dns-prefetch' href='//fonts.googleapis.com' /> <link rel='dns-prefetch' href='//v0.wordpress.com' /> <link rel="alternate" type="application/rss+xml" title="Ensembl Blog » Feed" href="https://www.ensembl.info/feed/" /> <link rel="alternate" type="application/rss+xml" title="Ensembl Blog » Comments Feed" href="https://www.ensembl.info/comments/feed/" /> <!-- This site uses the Google Analytics by MonsterInsights plugin v8.23.1 - Using Analytics tracking - https://www.monsterinsights.com/ --> <!-- Note: MonsterInsights is not currently configured on this site. The site owner needs to authenticate with Google Analytics in the MonsterInsights settings panel. --> <!-- No tracking code set --> <!-- / Google Analytics by MonsterInsights --> <script type="text/javascript"> /* <![CDATA[ */ window._wpemojiSettings = {"baseUrl":"https:\/\/s.w.org\/images\/core\/emoji\/14.0.0\/72x72\/","ext":".png","svgUrl":"https:\/\/s.w.org\/images\/core\/emoji\/14.0.0\/svg\/","svgExt":".svg","source":{"concatemoji":"https:\/\/www.ensembl.info\/wp-includes\/js\/wp-emoji-release.min.js?ver=6.4.5"}}; /*! 
This file is auto-generated */ !function(i,n){var o,s,e;function c(e){try{var t={supportTests:e,timestamp:(new Date).valueOf()};sessionStorage.setItem(o,JSON.stringify(t))}catch(e){}}function p(e,t,n){e.clearRect(0,0,e.canvas.width,e.canvas.height),e.fillText(t,0,0);var t=new Uint32Array(e.getImageData(0,0,e.canvas.width,e.canvas.height).data),r=(e.clearRect(0,0,e.canvas.width,e.canvas.height),e.fillText(n,0,0),new Uint32Array(e.getImageData(0,0,e.canvas.width,e.canvas.height).data));return t.every(function(e,t){return e===r[t]})}function u(e,t,n){switch(t){case"flag":return n(e,"\ud83c\udff3\ufe0f\u200d\u26a7\ufe0f","\ud83c\udff3\ufe0f\u200b\u26a7\ufe0f")?!1:!n(e,"\ud83c\uddfa\ud83c\uddf3","\ud83c\uddfa\u200b\ud83c\uddf3")&&!n(e,"\ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc65\udb40\udc6e\udb40\udc67\udb40\udc7f","\ud83c\udff4\u200b\udb40\udc67\u200b\udb40\udc62\u200b\udb40\udc65\u200b\udb40\udc6e\u200b\udb40\udc67\u200b\udb40\udc7f");case"emoji":return!n(e,"\ud83e\udef1\ud83c\udffb\u200d\ud83e\udef2\ud83c\udfff","\ud83e\udef1\ud83c\udffb\u200b\ud83e\udef2\ud83c\udfff")}return!1}function f(e,t,n){var r="undefined"!=typeof WorkerGlobalScope&&self instanceof WorkerGlobalScope?new OffscreenCanvas(300,150):i.createElement("canvas"),a=r.getContext("2d",{willReadFrequently:!0}),o=(a.textBaseline="top",a.font="600 32px Arial",{});return e.forEach(function(e){o[e]=t(a,e,n)}),o}function t(e){var t=i.createElement("script");t.src=e,t.defer=!0,i.head.appendChild(t)}"undefined"!=typeof Promise&&(o="wpEmojiSettingsSupports",s=["flag","emoji"],n.supports={everything:!0,everythingExceptFlag:!0},e=new Promise(function(e){i.addEventListener("DOMContentLoaded",e,{once:!0})}),new Promise(function(t){var n=function(){try{var e=JSON.parse(sessionStorage.getItem(o));if("object"==typeof e&&"number"==typeof e.timestamp&&(new Date).valueOf()<e.timestamp+604800&&"object"==typeof e.supportTests)return e.supportTests}catch(e){}return null}();if(!n){if("undefined"!=typeof 
Worker&&"undefined"!=typeof OffscreenCanvas&&"undefined"!=typeof URL&&URL.createObjectURL&&"undefined"!=typeof Blob)try{var e="postMessage("+f.toString()+"("+[JSON.stringify(s),u.toString(),p.toString()].join(",")+"));",r=new Blob([e],{type:"text/javascript"}),a=new Worker(URL.createObjectURL(r),{name:"wpTestEmojiSupports"});return void(a.onmessage=function(e){c(n=e.data),a.terminate(),t(n)})}catch(e){}c(n=f(s,u,p))}t(n)}).then(function(e){for(var t in e)n.supports[t]=e[t],n.supports.everything=n.supports.everything&&n.supports[t],"flag"!==t&&(n.supports.everythingExceptFlag=n.supports.everythingExceptFlag&&n.supports[t]);n.supports.everythingExceptFlag=n.supports.everythingExceptFlag&&!n.supports.flag,n.DOMReady=!1,n.readyCallback=function(){n.DOMReady=!0}}).then(function(){return e}).then(function(){var e;n.supports.everything||(n.readyCallback(),(e=n.source||{}).concatemoji?t(e.concatemoji):e.wpemoji&&e.twemoji&&(t(e.twemoji),t(e.wpemoji)))}))}((window,document),window._wpemojiSettings); /* ]]> */ </script> <style id='wp-emoji-styles-inline-css' type='text/css'> img.wp-smiley, img.emoji { display: inline !important; border: none !important; box-shadow: none !important; height: 1em !important; width: 1em !important; margin: 0 0.07em !important; vertical-align: -0.1em !important; background: none !important; padding: 0 !important; } </style> <link rel='stylesheet' id='wp-block-library-css' href='https://www.ensembl.info/wp-includes/css/dist/block-library/style.min.css?ver=6.4.5' media='all' /> <style id='wp-block-library-inline-css' type='text/css'> .has-text-align-justify{text-align:justify;} </style> <link rel='stylesheet' id='mediaelement-css' href='https://www.ensembl.info/wp-includes/js/mediaelement/mediaelementplayer-legacy.min.css?ver=4.2.17' media='all' /> <link rel='stylesheet' id='wp-mediaelement-css' href='https://www.ensembl.info/wp-includes/js/mediaelement/wp-mediaelement.min.css?ver=6.4.5' media='all' /> <style id='classic-theme-styles-inline-css' 
type='text/css'> /*! This file is auto-generated */ .wp-block-button__link{color:#fff;background-color:#32373c;border-radius:9999px;box-shadow:none;text-decoration:none;padding:calc(.667em + 2px) calc(1.333em + 2px);font-size:1.125em}.wp-block-file__button{background:#32373c;color:#fff;text-decoration:none} </style> <style id='global-styles-inline-css' type='text/css'> body{--wp--preset--color--black: #000000;--wp--preset--color--cyan-bluish-gray: #abb8c3;--wp--preset--color--white: #ffffff;--wp--preset--color--pale-pink: #f78da7;--wp--preset--color--vivid-red: #cf2e2e;--wp--preset--color--luminous-vivid-orange: #ff6900;--wp--preset--color--luminous-vivid-amber: #fcb900;--wp--preset--color--light-green-cyan: #7bdcb5;--wp--preset--color--vivid-green-cyan: #00d084;--wp--preset--color--pale-cyan-blue: #8ed1fc;--wp--preset--color--vivid-cyan-blue: #0693e3;--wp--preset--color--vivid-purple: #9b51e0;--wp--preset--gradient--vivid-cyan-blue-to-vivid-purple: linear-gradient(135deg,rgba(6,147,227,1) 0%,rgb(155,81,224) 100%);--wp--preset--gradient--light-green-cyan-to-vivid-green-cyan: linear-gradient(135deg,rgb(122,220,180) 0%,rgb(0,208,130) 100%);--wp--preset--gradient--luminous-vivid-amber-to-luminous-vivid-orange: linear-gradient(135deg,rgba(252,185,0,1) 0%,rgba(255,105,0,1) 100%);--wp--preset--gradient--luminous-vivid-orange-to-vivid-red: linear-gradient(135deg,rgba(255,105,0,1) 0%,rgb(207,46,46) 100%);--wp--preset--gradient--very-light-gray-to-cyan-bluish-gray: linear-gradient(135deg,rgb(238,238,238) 0%,rgb(169,184,195) 100%);--wp--preset--gradient--cool-to-warm-spectrum: linear-gradient(135deg,rgb(74,234,220) 0%,rgb(151,120,209) 20%,rgb(207,42,186) 40%,rgb(238,44,130) 60%,rgb(251,105,98) 80%,rgb(254,248,76) 100%);--wp--preset--gradient--blush-light-purple: linear-gradient(135deg,rgb(255,206,236) 0%,rgb(152,150,240) 100%);--wp--preset--gradient--blush-bordeaux: linear-gradient(135deg,rgb(254,205,165) 0%,rgb(254,45,45) 50%,rgb(107,0,62) 
100%);--wp--preset--gradient--luminous-dusk: linear-gradient(135deg,rgb(255,203,112) 0%,rgb(199,81,192) 50%,rgb(65,88,208) 100%);--wp--preset--gradient--pale-ocean: linear-gradient(135deg,rgb(255,245,203) 0%,rgb(182,227,212) 50%,rgb(51,167,181) 100%);--wp--preset--gradient--electric-grass: linear-gradient(135deg,rgb(202,248,128) 0%,rgb(113,206,126) 100%);--wp--preset--gradient--midnight: linear-gradient(135deg,rgb(2,3,129) 0%,rgb(40,116,252) 100%);--wp--preset--font-size--small: 13px;--wp--preset--font-size--medium: 20px;--wp--preset--font-size--large: 36px;--wp--preset--font-size--x-large: 42px;--wp--preset--spacing--20: 0.44rem;--wp--preset--spacing--30: 0.67rem;--wp--preset--spacing--40: 1rem;--wp--preset--spacing--50: 1.5rem;--wp--preset--spacing--60: 2.25rem;--wp--preset--spacing--70: 3.38rem;--wp--preset--spacing--80: 5.06rem;--wp--preset--shadow--natural: 6px 6px 9px rgba(0, 0, 0, 0.2);--wp--preset--shadow--deep: 12px 12px 50px rgba(0, 0, 0, 0.4);--wp--preset--shadow--sharp: 6px 6px 0px rgba(0, 0, 0, 0.2);--wp--preset--shadow--outlined: 6px 6px 0px -3px rgba(255, 255, 255, 1), 6px 6px rgba(0, 0, 0, 1);--wp--preset--shadow--crisp: 6px 6px 0px rgba(0, 0, 0, 1);}:where(.is-layout-flex){gap: 0.5em;}:where(.is-layout-grid){gap: 0.5em;}body .is-layout-flow > .alignleft{float: left;margin-inline-start: 0;margin-inline-end: 2em;}body .is-layout-flow > .alignright{float: right;margin-inline-start: 2em;margin-inline-end: 0;}body .is-layout-flow > .aligncenter{margin-left: auto !important;margin-right: auto !important;}body .is-layout-constrained > .alignleft{float: left;margin-inline-start: 0;margin-inline-end: 2em;}body .is-layout-constrained > .alignright{float: right;margin-inline-start: 2em;margin-inline-end: 0;}body .is-layout-constrained > .aligncenter{margin-left: auto !important;margin-right: auto !important;}body .is-layout-constrained > :where(:not(.alignleft):not(.alignright):not(.alignfull)){max-width: var(--wp--style--global--content-size);margin-left: 
auto !important;margin-right: auto !important;}body .is-layout-constrained > .alignwide{max-width: var(--wp--style--global--wide-size);}body .is-layout-flex{display: flex;}body .is-layout-flex{flex-wrap: wrap;align-items: center;}body .is-layout-flex > *{margin: 0;}body .is-layout-grid{display: grid;}body .is-layout-grid > *{margin: 0;}:where(.wp-block-columns.is-layout-flex){gap: 2em;}:where(.wp-block-columns.is-layout-grid){gap: 2em;}:where(.wp-block-post-template.is-layout-flex){gap: 1.25em;}:where(.wp-block-post-template.is-layout-grid){gap: 1.25em;}.has-black-color{color: var(--wp--preset--color--black) !important;}.has-cyan-bluish-gray-color{color: var(--wp--preset--color--cyan-bluish-gray) !important;}.has-white-color{color: var(--wp--preset--color--white) !important;}.has-pale-pink-color{color: var(--wp--preset--color--pale-pink) !important;}.has-vivid-red-color{color: var(--wp--preset--color--vivid-red) !important;}.has-luminous-vivid-orange-color{color: var(--wp--preset--color--luminous-vivid-orange) !important;}.has-luminous-vivid-amber-color{color: var(--wp--preset--color--luminous-vivid-amber) !important;}.has-light-green-cyan-color{color: var(--wp--preset--color--light-green-cyan) !important;}.has-vivid-green-cyan-color{color: var(--wp--preset--color--vivid-green-cyan) !important;}.has-pale-cyan-blue-color{color: var(--wp--preset--color--pale-cyan-blue) !important;}.has-vivid-cyan-blue-color{color: var(--wp--preset--color--vivid-cyan-blue) !important;}.has-vivid-purple-color{color: var(--wp--preset--color--vivid-purple) !important;}.has-black-background-color{background-color: var(--wp--preset--color--black) !important;}.has-cyan-bluish-gray-background-color{background-color: var(--wp--preset--color--cyan-bluish-gray) !important;}.has-white-background-color{background-color: var(--wp--preset--color--white) !important;}.has-pale-pink-background-color{background-color: var(--wp--preset--color--pale-pink) 
!important;}.has-vivid-red-background-color{background-color: var(--wp--preset--color--vivid-red) !important;}.has-luminous-vivid-orange-background-color{background-color: var(--wp--preset--color--luminous-vivid-orange) !important;}.has-luminous-vivid-amber-background-color{background-color: var(--wp--preset--color--luminous-vivid-amber) !important;}.has-light-green-cyan-background-color{background-color: var(--wp--preset--color--light-green-cyan) !important;}.has-vivid-green-cyan-background-color{background-color: var(--wp--preset--color--vivid-green-cyan) !important;}.has-pale-cyan-blue-background-color{background-color: var(--wp--preset--color--pale-cyan-blue) !important;}.has-vivid-cyan-blue-background-color{background-color: var(--wp--preset--color--vivid-cyan-blue) !important;}.has-vivid-purple-background-color{background-color: var(--wp--preset--color--vivid-purple) !important;}.has-black-border-color{border-color: var(--wp--preset--color--black) !important;}.has-cyan-bluish-gray-border-color{border-color: var(--wp--preset--color--cyan-bluish-gray) !important;}.has-white-border-color{border-color: var(--wp--preset--color--white) !important;}.has-pale-pink-border-color{border-color: var(--wp--preset--color--pale-pink) !important;}.has-vivid-red-border-color{border-color: var(--wp--preset--color--vivid-red) !important;}.has-luminous-vivid-orange-border-color{border-color: var(--wp--preset--color--luminous-vivid-orange) !important;}.has-luminous-vivid-amber-border-color{border-color: var(--wp--preset--color--luminous-vivid-amber) !important;}.has-light-green-cyan-border-color{border-color: var(--wp--preset--color--light-green-cyan) !important;}.has-vivid-green-cyan-border-color{border-color: var(--wp--preset--color--vivid-green-cyan) !important;}.has-pale-cyan-blue-border-color{border-color: var(--wp--preset--color--pale-cyan-blue) !important;}.has-vivid-cyan-blue-border-color{border-color: var(--wp--preset--color--vivid-cyan-blue) 
!important;}.has-vivid-purple-border-color{border-color: var(--wp--preset--color--vivid-purple) !important;}.has-vivid-cyan-blue-to-vivid-purple-gradient-background{background: var(--wp--preset--gradient--vivid-cyan-blue-to-vivid-purple) !important;}.has-light-green-cyan-to-vivid-green-cyan-gradient-background{background: var(--wp--preset--gradient--light-green-cyan-to-vivid-green-cyan) !important;}.has-luminous-vivid-amber-to-luminous-vivid-orange-gradient-background{background: var(--wp--preset--gradient--luminous-vivid-amber-to-luminous-vivid-orange) !important;}.has-luminous-vivid-orange-to-vivid-red-gradient-background{background: var(--wp--preset--gradient--luminous-vivid-orange-to-vivid-red) !important;}.has-very-light-gray-to-cyan-bluish-gray-gradient-background{background: var(--wp--preset--gradient--very-light-gray-to-cyan-bluish-gray) !important;}.has-cool-to-warm-spectrum-gradient-background{background: var(--wp--preset--gradient--cool-to-warm-spectrum) !important;}.has-blush-light-purple-gradient-background{background: var(--wp--preset--gradient--blush-light-purple) !important;}.has-blush-bordeaux-gradient-background{background: var(--wp--preset--gradient--blush-bordeaux) !important;}.has-luminous-dusk-gradient-background{background: var(--wp--preset--gradient--luminous-dusk) !important;}.has-pale-ocean-gradient-background{background: var(--wp--preset--gradient--pale-ocean) !important;}.has-electric-grass-gradient-background{background: var(--wp--preset--gradient--electric-grass) !important;}.has-midnight-gradient-background{background: var(--wp--preset--gradient--midnight) !important;}.has-small-font-size{font-size: var(--wp--preset--font-size--small) !important;}.has-medium-font-size{font-size: var(--wp--preset--font-size--medium) !important;}.has-large-font-size{font-size: var(--wp--preset--font-size--large) !important;}.has-x-large-font-size{font-size: var(--wp--preset--font-size--x-large) !important;} .wp-block-navigation 
a:where(:not(.wp-element-button)){color: inherit;}
:where(.wp-block-post-template.is-layout-flex){gap: 1.25em;}:where(.wp-block-post-template.is-layout-grid){gap: 1.25em;}
:where(.wp-block-columns.is-layout-flex){gap: 2em;}:where(.wp-block-columns.is-layout-grid){gap: 2em;}
.wp-block-pullquote{font-size: 1.5em;line-height: 1.6;}
</style>
<!-- Web fonts: Amatic SC, Roboto (400 and 700), Roboto Slab and Roboto Condensed.
     Requested over explicit https so the URL does not inherit the scheme the page
     was opened with (a protocol-relative URL breaks when the file is opened locally). -->
<link rel='stylesheet' id='flat-fonts-css' href='https://fonts.googleapis.com/css?family=Amatic+SC%7CRoboto:400,700%7CRoboto+Slab%7CRoboto+Condensed' media='all' />
<!-- Stylesheets, in cascade order: Flat theme, flat-ensembl theme, Jetpack plugin. -->
<link rel='stylesheet' id='flat-theme-css' href='https://www.ensembl.info/wp-content/themes/flat/assets/css/flat.min.css?ver=1.7.11' media='all' />
<link rel='stylesheet' id='flat-style-css' href='https://www.ensembl.info/wp-content/themes/flat-ensembl/style.css?ver=6.4.5' media='all' />
<link rel='stylesheet' id='jetpack_css-css' href='https://www.ensembl.info/wp-content/plugins/jetpack/css/jetpack.css?ver=12.9.4' media='all' />
<!-- Scripts load synchronously and in this order: jQuery, jQuery Migrate, then the theme script. -->
<script type="text/javascript" src="https://www.ensembl.info/wp-includes/js/jquery/jquery.min.js?ver=3.7.1" id="jquery-core-js"></script>
<script type="text/javascript" src="https://www.ensembl.info/wp-includes/js/jquery/jquery-migrate.min.js?ver=3.4.1" id="jquery-migrate-js"></script>
<script type="text/javascript" src="https://www.ensembl.info/wp-content/themes/flat/assets/js/flat.min.js?ver=1.7.11" id="flat-js-js"></script>
<!-- The html5shiv script below sits in a conditional comment that only Internet Explorer versions below 9 evaluate. -->
<!--[if lt IE 9]> <script type="text/javascript" src="https://www.ensembl.info/wp-content/themes/flat/assets/js/html5shiv.min.js?ver=3.7.2" id="html5shiv-js"></script> <![endif]-->
<!-- Discovery links: REST API root, JSON representation of page 9395, and the RSD endpoint. -->
<link rel="https://api.w.org/" href="https://www.ensembl.info/wp-json/" /><link rel="alternate" type="application/json" href="https://www.ensembl.info/wp-json/wp/v2/pages/9395" /><link rel="EditURI" type="application/rsd+xml" title="RSD" href="https://www.ensembl.info/xmlrpc.php?rsd" />
<meta name="generator" content="WordPress 6.4.5" />
<!-- Canonical URL of this page, followed by its short link. -->
<link rel="canonical" href="https://www.ensembl.info/about/projects/" />
<link rel='shortlink'
href='https://wp.me/P3A2rn-2rx' />
<!-- oEmbed discovery links for this page, in JSON and XML form.
     The ampersand before "format" is written as a character reference so the
     query string can never be misread as the start of an entity. -->
<link rel="alternate" type="application/json+oembed" href="https://www.ensembl.info/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fwww.ensembl.info%2Fabout%2Fprojects%2F" />
<link rel="alternate" type="text/xml+oembed" href="https://www.ensembl.info/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fwww.ensembl.info%2Fabout%2Fprojects%2F&amp;format=xml" />
<!-- Hides the image with id "wpstats" (presumably the Jetpack stats pixel; confirm against the footer markup). -->
<style>img#wpstats{display:none}</style>
<!-- Sidebar background colour; below 1200px the same colour is applied to the page container. -->
<style type="text/css">#page:before, .sidebar-offcanvas, #secondary { background-color: #333366; }@media (max-width: 1199px) { #page > .container { background-color: #333366; } }</style>
<!-- Font assignments for the site title, body text, headings and meta text. -->
<style type="text/css">#masthead .site-title {font-family:Amatic SC}body {font-family:Roboto }h1,h2,h3,h4,h5,h6 {font-family:Roboto Slab}#masthead .site-description, .hentry .entry-meta {font-family:Roboto Condensed}</style>
<!-- Tiled, fixed background image applied when the body carries the custom-background class. -->
<style type="text/css" id="custom-background-css"> body.custom-background { background-image: url("https://www.ensembl.info/wp-content/uploads/2018/02/weird_genomes_4tile2.png"); background-position: left top; background-size: auto; background-repeat: repeat; background-attachment: fixed; } </style>
<!-- Jetpack Open Graph Tags -->
<meta property="og:type" content="article" />
<meta property="og:title" content="Project ideas" />
<meta property="og:url" content="https://www.ensembl.info/about/projects/" />
<meta property="og:description" content="Ensembl is part of the Genome Assembly and Annotation (GAA) section of EMBL-EBI. The GAA section contains other popular services such as MGnify, HGNC/VGNC and Wormbase.
Below are project ideas rela…" />
<meta property="article:published_time" content="2018-02-05T16:10:53+00:00" />
<meta property="article:modified_time" content="2023-03-02T15:09:29+00:00" />
<meta property="og:site_name" content="Ensembl Blog" />
<meta property="og:image" content="https://www.ensembl.info/wp-content/uploads/2018/01/cropped-ebang-512.png" />
<meta property="og:image:width" content="512" />
<meta property="og:image:height" content="512" />
<meta property="og:image:alt" content="" />
<meta property="og:locale" content="en_GB" />
<meta name="twitter:text:title" content="Project ideas" />
<meta name="twitter:image" content="https://www.ensembl.info/wp-content/uploads/2018/01/cropped-ebang-512-270x270.png" />
<meta name="twitter:card" content="summary" />
<!-- End Jetpack Open Graph Tags -->
<!-- Site icons: 32x32 and 192x192 favicons, an Apple touch icon and a Windows tile image. -->
<link rel="icon" href="https://www.ensembl.info/wp-content/uploads/2018/01/cropped-ebang-512-32x32.png" sizes="32x32" />
<link rel="icon" href="https://www.ensembl.info/wp-content/uploads/2018/01/cropped-ebang-512-192x192.png" sizes="192x192" />
<link rel="apple-touch-icon" href="https://www.ensembl.info/wp-content/uploads/2018/01/cropped-ebang-512-180x180.png" />
<meta name="msapplication-TileImage" content="https://www.ensembl.info/wp-content/uploads/2018/01/cropped-ebang-512-270x270.png" />
</head>
<!-- The body is marked up as a schema.org WebPage item; the itemprop attributes
     further down (primaryImageOfPage, name, description) are properties of that item. -->
<body class="page-template-default page page-id-9395 page-child parent-pageid-2 custom-background" itemscope itemtype="http://schema.org/WebPage">
<div id="page">
<div class="container">
<div class="row row-offcanvas row-offcanvas-left">
<!-- Left column: site header followed by the widget sidebar. -->
<div id="secondary" class="col-lg-3">
<header id="masthead" class="site-header" role="banner">
<div class="hgroup">
<!-- Logo and site name share one home link. The itemprop value must be spelled
     exactly "primaryImageOfPage" because schema.org property names are case-sensitive.
     No whitespace is placed between the image and the name so the rendering is unchanged. -->
<h1 class="site-title display-title-logo"><a href="https://www.ensembl.info/" title="Ensembl Blog" rel="home"><img itemprop="primaryImageOfPage" alt="Ensembl Blog" src="https://www.ensembl.info/wp-content/uploads/2018/02/cropped-ebang-512-32x32.png" /><span itemprop="name">Ensembl Blog</span></a></h1><h2
itemprop="description" class="site-description">News about the Ensembl Project and its genome browser</h2> </div> <button type="button" class="btn btn-link hidden-lg toggle-sidebar" data-toggle="offcanvas" aria-label="Sidebar"><i class="fa fa-gear"></i></button> <button type="button" class="btn btn-link hidden-lg toggle-navigation" aria-label="Navigation Menu"><i class="fa fa-bars"></i></button> <nav id="site-navigation" class="navigation main-navigation" role="navigation"> <div class="nav-menu"><ul> <li class="page_item page-item-2 page_item_has_children current_page_ancestor current_page_parent"><a href="https://www.ensembl.info/about/">About Us</a> <ul class='children'> <li class="page_item page-item-11490"><a href="https://www.ensembl.info/about/documentation-projects/">Documentation projects</a></li> <li class="page_item page-item-524"><a href="https://www.ensembl.info/about/roadmap/">Future Plans</a></li> <li class="page_item page-item-9395 current_page_item"><a href="https://www.ensembl.info/about/projects/" aria-current="page">Project ideas</a></li> </ul> </li> <li class="page_item page-item-9409"><a href="http://training.ensembl.org">Workshops</a></li> <li class="page_item page-item-635 page_item_has_children"><a href="https://www.ensembl.info/known-bugs/">Known Bugs</a> <ul class='children'> <li class="page_item page-item-12384"><a href="https://www.ensembl.info/known-bugs/ensembl-100/">Ensembl 100</a></li> <li class="page_item page-item-12720"><a href="https://www.ensembl.info/known-bugs/ensembl-101/">Ensembl 101</a></li> <li class="page_item page-item-13006"><a href="https://www.ensembl.info/known-bugs/ensembl-102/">Ensembl 102</a></li> <li class="page_item page-item-13132"><a href="https://www.ensembl.info/known-bugs/ensembl-103/">Ensembl 103</a></li> <li class="page_item page-item-13320"><a href="https://www.ensembl.info/known-bugs/ensembl-104/">Ensembl 104</a></li> <li class="page_item page-item-13632"><a 
href="https://www.ensembl.info/known-bugs/ensembl-105/">Ensembl 105</a></li> <li class="page_item page-item-13907"><a href="https://www.ensembl.info/known-bugs/ensembl-106/">Ensembl 106</a></li> <li class="page_item page-item-14072"><a href="https://www.ensembl.info/known-bugs/ensembl-107/">Ensembl 107</a></li> <li class="page_item page-item-14241"><a href="https://www.ensembl.info/known-bugs/ensembl-108/">Ensembl 108</a></li> <li class="page_item page-item-14478"><a href="https://www.ensembl.info/known-bugs/ensembl-109/">Ensembl 109</a></li> <li class="page_item page-item-14839"><a href="https://www.ensembl.info/known-bugs/ensembl-110/">Ensembl 110</a></li> <li class="page_item page-item-15176"><a href="https://www.ensembl.info/known-bugs/ensembl-111/">Ensembl 111</a></li> <li class="page_item page-item-15448"><a href="https://www.ensembl.info/known-bugs/ensembl-112/">Ensembl 112</a></li> <li class="page_item page-item-15719"><a href="https://www.ensembl.info/known-bugs/ensembl-113/">Ensembl 113</a></li> <li class="page_item page-item-12387"><a href="https://www.ensembl.info/known-bugs/ensembl-99-and-earlier/">Ensembl 99 and earlier</a></li> </ul> </li> <li class="page_item page-item-6"><a href="https://www.ensembl.info/contact-us/">Contact Us</a></li> </ul></div> </nav> </header> <div class="sidebar-offcanvas"> <div id="main-sidebar" class="widget-area" role="complementary"> <aside id="search-2" class="widget widget_search"> <form method="get" id="searchform" action="https://www.ensembl.info/"> <label for="s" class="assistive-text">Search</label> <input type="text" class="field" name="s" id="s" placeholder="Search" /> <input type="submit" class="submit" name="submit" id="searchsubmit" value="Search" /> </form> </aside> <aside id="categories-2" class="widget widget_categories"> <h3 class='widget-title'>Categories</h3> <ul> <li class="cat-item cat-item-4"><a href="https://www.ensembl.info/category/01-release/">Release announcements</a> </li> <li class="cat-item 
cat-item-282"><a href="https://www.ensembl.info/category/02-covid-19/">COVID-19</a> </li> <li class="cat-item cat-item-289"><a href="https://www.ensembl.info/category/02-ensembl-vep/">Ensembl VEP</a> </li> <li class="cat-item cat-item-12"><a href="https://www.ensembl.info/category/02-updates/">New data and web features</a> </li> <li class="cat-item cat-item-6"><a href="https://www.ensembl.info/category/03-other/">Other news</a> </li> <li class="cat-item cat-item-7"><a href="https://www.ensembl.info/category/04-training/">Training</a> </li> <li class="cat-item cat-item-3"><a href="https://www.ensembl.info/category/05-community/">Community</a> </li> <li class="cat-item cat-item-11"><a href="https://www.ensembl.info/category/06-jobs/">Jobs @ Ensembl</a> </li> <li class="cat-item cat-item-5"><a href="https://www.ensembl.info/category/07-status/">Service status</a> </li> <li class="cat-item cat-item-1"><a href="https://www.ensembl.info/category/uncategorised/">Uncategorised</a> </li> </ul> </aside> <aside id="archives-2" class="widget widget_archive"> <h3 class='widget-title'>Archives</h3> <label class="screen-reader-text" for="archives-dropdown-2">Archives</label> <select id="archives-dropdown-2" name="archive-dropdown"> <option value="">Select Month</option> <option value='https://www.ensembl.info/2024/11/'> November 2024 </option> <option value='https://www.ensembl.info/2024/10/'> October 2024 </option> <option value='https://www.ensembl.info/2024/08/'> August 2024 </option> <option value='https://www.ensembl.info/2024/07/'> July 2024 </option> <option value='https://www.ensembl.info/2024/06/'> June 2024 </option> <option value='https://www.ensembl.info/2024/05/'> May 2024 </option> <option value='https://www.ensembl.info/2024/04/'> April 2024 </option> <option value='https://www.ensembl.info/2024/03/'> March 2024 </option> <option value='https://www.ensembl.info/2024/02/'> February 2024 </option> <option value='https://www.ensembl.info/2024/01/'> January 2024 
</option> <option value='https://www.ensembl.info/2023/11/'> November 2023 </option> <option value='https://www.ensembl.info/2023/10/'> October 2023 </option> <option value='https://www.ensembl.info/2023/09/'> September 2023 </option> <option value='https://www.ensembl.info/2023/08/'> August 2023 </option> <option value='https://www.ensembl.info/2023/07/'> July 2023 </option> <option value='https://www.ensembl.info/2023/06/'> June 2023 </option> <option value='https://www.ensembl.info/2023/05/'> May 2023 </option> <option value='https://www.ensembl.info/2023/04/'> April 2023 </option> <option value='https://www.ensembl.info/2023/03/'> March 2023 </option> <option value='https://www.ensembl.info/2023/02/'> February 2023 </option> <option value='https://www.ensembl.info/2023/01/'> January 2023 </option> <option value='https://www.ensembl.info/2022/12/'> December 2022 </option> <option value='https://www.ensembl.info/2022/11/'> November 2022 </option> <option value='https://www.ensembl.info/2022/10/'> October 2022 </option> <option value='https://www.ensembl.info/2022/08/'> August 2022 </option> <option value='https://www.ensembl.info/2022/07/'> July 2022 </option> <option value='https://www.ensembl.info/2022/06/'> June 2022 </option> <option value='https://www.ensembl.info/2022/05/'> May 2022 </option> <option value='https://www.ensembl.info/2022/04/'> April 2022 </option> <option value='https://www.ensembl.info/2022/03/'> March 2022 </option> <option value='https://www.ensembl.info/2022/02/'> February 2022 </option> <option value='https://www.ensembl.info/2022/01/'> January 2022 </option> <option value='https://www.ensembl.info/2021/12/'> December 2021 </option> <option value='https://www.ensembl.info/2021/11/'> November 2021 </option> <option value='https://www.ensembl.info/2021/10/'> October 2021 </option> <option value='https://www.ensembl.info/2021/09/'> September 2021 </option> <option value='https://www.ensembl.info/2021/08/'> August 2021 </option> <option 
value='https://www.ensembl.info/2021/06/'> June 2021 </option> <option value='https://www.ensembl.info/2021/05/'> May 2021 </option> <option value='https://www.ensembl.info/2021/04/'> April 2021 </option> <option value='https://www.ensembl.info/2021/03/'> March 2021 </option> <option value='https://www.ensembl.info/2021/02/'> February 2021 </option> <option value='https://www.ensembl.info/2021/01/'> January 2021 </option> <option value='https://www.ensembl.info/2020/12/'> December 2020 </option> <option value='https://www.ensembl.info/2020/11/'> November 2020 </option> <option value='https://www.ensembl.info/2020/10/'> October 2020 </option> <option value='https://www.ensembl.info/2020/09/'> September 2020 </option> <option value='https://www.ensembl.info/2020/08/'> August 2020 </option> <option value='https://www.ensembl.info/2020/07/'> July 2020 </option> <option value='https://www.ensembl.info/2020/06/'> June 2020 </option> <option value='https://www.ensembl.info/2020/05/'> May 2020 </option> <option value='https://www.ensembl.info/2020/04/'> April 2020 </option> <option value='https://www.ensembl.info/2020/03/'> March 2020 </option> <option value='https://www.ensembl.info/2020/02/'> February 2020 </option> <option value='https://www.ensembl.info/2020/01/'> January 2020 </option> <option value='https://www.ensembl.info/2019/11/'> November 2019 </option> <option value='https://www.ensembl.info/2019/10/'> October 2019 </option> <option value='https://www.ensembl.info/2019/09/'> September 2019 </option> <option value='https://www.ensembl.info/2019/08/'> August 2019 </option> <option value='https://www.ensembl.info/2019/07/'> July 2019 </option> <option value='https://www.ensembl.info/2019/06/'> June 2019 </option> <option value='https://www.ensembl.info/2019/05/'> May 2019 </option> <option value='https://www.ensembl.info/2019/04/'> April 2019 </option> <option value='https://www.ensembl.info/2019/03/'> March 2019 </option> <option 
value='https://www.ensembl.info/2019/02/'> February 2019 </option> <option value='https://www.ensembl.info/2019/01/'> January 2019 </option> <option value='https://www.ensembl.info/2018/11/'> November 2018 </option> <option value='https://www.ensembl.info/2018/10/'> October 2018 </option> <option value='https://www.ensembl.info/2018/09/'> September 2018 </option> <option value='https://www.ensembl.info/2018/08/'> August 2018 </option> <option value='https://www.ensembl.info/2018/07/'> July 2018 </option> <option value='https://www.ensembl.info/2018/06/'> June 2018 </option> <option value='https://www.ensembl.info/2018/05/'> May 2018 </option> <option value='https://www.ensembl.info/2018/04/'> April 2018 </option> <option value='https://www.ensembl.info/2018/03/'> March 2018 </option> <option value='https://www.ensembl.info/2018/02/'> February 2018 </option> <option value='https://www.ensembl.info/2018/01/'> January 2018 </option> <option value='https://www.ensembl.info/2017/12/'> December 2017 </option> <option value='https://www.ensembl.info/2017/11/'> November 2017 </option> <option value='https://www.ensembl.info/2017/10/'> October 2017 </option> <option value='https://www.ensembl.info/2017/09/'> September 2017 </option> <option value='https://www.ensembl.info/2017/08/'> August 2017 </option> <option value='https://www.ensembl.info/2017/07/'> July 2017 </option> <option value='https://www.ensembl.info/2017/06/'> June 2017 </option> <option value='https://www.ensembl.info/2017/05/'> May 2017 </option> <option value='https://www.ensembl.info/2017/04/'> April 2017 </option> <option value='https://www.ensembl.info/2017/03/'> March 2017 </option> <option value='https://www.ensembl.info/2017/02/'> February 2017 </option> <option value='https://www.ensembl.info/2017/01/'> January 2017 </option> <option value='https://www.ensembl.info/2016/12/'> December 2016 </option> <option value='https://www.ensembl.info/2016/11/'> November 2016 </option> <option 
value='https://www.ensembl.info/2016/10/'> October 2016 </option> <option value='https://www.ensembl.info/2016/08/'> August 2016 </option> <option value='https://www.ensembl.info/2016/07/'> July 2016 </option> <option value='https://www.ensembl.info/2016/06/'> June 2016 </option> <option value='https://www.ensembl.info/2016/04/'> April 2016 </option> <option value='https://www.ensembl.info/2016/03/'> March 2016 </option> <option value='https://www.ensembl.info/2016/02/'> February 2016 </option> <option value='https://www.ensembl.info/2016/01/'> January 2016 </option> <option value='https://www.ensembl.info/2015/12/'> December 2015 </option> <option value='https://www.ensembl.info/2015/11/'> November 2015 </option> <option value='https://www.ensembl.info/2015/10/'> October 2015 </option> <option value='https://www.ensembl.info/2015/09/'> September 2015 </option> <option value='https://www.ensembl.info/2015/08/'> August 2015 </option> <option value='https://www.ensembl.info/2015/07/'> July 2015 </option> <option value='https://www.ensembl.info/2015/06/'> June 2015 </option> <option value='https://www.ensembl.info/2015/05/'> May 2015 </option> <option value='https://www.ensembl.info/2015/04/'> April 2015 </option> <option value='https://www.ensembl.info/2015/03/'> March 2015 </option> <option value='https://www.ensembl.info/2015/02/'> February 2015 </option> <option value='https://www.ensembl.info/2015/01/'> January 2015 </option> <option value='https://www.ensembl.info/2014/12/'> December 2014 </option> <option value='https://www.ensembl.info/2014/11/'> November 2014 </option> <option value='https://www.ensembl.info/2014/10/'> October 2014 </option> <option value='https://www.ensembl.info/2014/08/'> August 2014 </option> <option value='https://www.ensembl.info/2014/07/'> July 2014 </option> <option value='https://www.ensembl.info/2014/06/'> June 2014 </option> <option value='https://www.ensembl.info/2014/05/'> May 2014 </option> <option 
value='https://www.ensembl.info/2014/04/'> April 2014 </option> <option value='https://www.ensembl.info/2014/03/'> March 2014 </option> <option value='https://www.ensembl.info/2014/02/'> February 2014 </option> <option value='https://www.ensembl.info/2014/01/'> January 2014 </option> <option value='https://www.ensembl.info/2013/12/'> December 2013 </option> <option value='https://www.ensembl.info/2013/11/'> November 2013 </option> <option value='https://www.ensembl.info/2013/10/'> October 2013 </option> <option value='https://www.ensembl.info/2013/09/'> September 2013 </option> <option value='https://www.ensembl.info/2013/08/'> August 2013 </option> <option value='https://www.ensembl.info/2013/07/'> July 2013 </option> <option value='https://www.ensembl.info/2013/06/'> June 2013 </option> <option value='https://www.ensembl.info/2013/05/'> May 2013 </option> <option value='https://www.ensembl.info/2013/04/'> April 2013 </option> <option value='https://www.ensembl.info/2013/03/'> March 2013 </option> <option value='https://www.ensembl.info/2013/02/'> February 2013 </option> <option value='https://www.ensembl.info/2013/01/'> January 2013 </option> <option value='https://www.ensembl.info/2012/12/'> December 2012 </option> <option value='https://www.ensembl.info/2012/11/'> November 2012 </option> <option value='https://www.ensembl.info/2012/10/'> October 2012 </option> <option value='https://www.ensembl.info/2012/09/'> September 2012 </option> <option value='https://www.ensembl.info/2012/08/'> August 2012 </option> <option value='https://www.ensembl.info/2012/07/'> July 2012 </option> <option value='https://www.ensembl.info/2012/06/'> June 2012 </option> <option value='https://www.ensembl.info/2012/05/'> May 2012 </option> <option value='https://www.ensembl.info/2012/04/'> April 2012 </option> <option value='https://www.ensembl.info/2012/03/'> March 2012 </option> <option value='https://www.ensembl.info/2012/02/'> February 2012 </option> <option 
value='https://www.ensembl.info/2012/01/'> January 2012 </option> <option value='https://www.ensembl.info/2011/12/'> December 2011 </option> <option value='https://www.ensembl.info/2011/11/'> November 2011 </option> <option value='https://www.ensembl.info/2011/10/'> October 2011 </option> <option value='https://www.ensembl.info/2011/09/'> September 2011 </option> <option value='https://www.ensembl.info/2011/08/'> August 2011 </option> <option value='https://www.ensembl.info/2011/07/'> July 2011 </option> <option value='https://www.ensembl.info/2011/06/'> June 2011 </option> <option value='https://www.ensembl.info/2011/05/'> May 2011 </option> <option value='https://www.ensembl.info/2011/04/'> April 2011 </option> <option value='https://www.ensembl.info/2011/03/'> March 2011 </option> <option value='https://www.ensembl.info/2011/02/'> February 2011 </option> <option value='https://www.ensembl.info/2011/01/'> January 2011 </option> <option value='https://www.ensembl.info/2010/12/'> December 2010 </option> <option value='https://www.ensembl.info/2010/11/'> November 2010 </option> <option value='https://www.ensembl.info/2010/10/'> October 2010 </option> <option value='https://www.ensembl.info/2010/09/'> September 2010 </option> <option value='https://www.ensembl.info/2010/08/'> August 2010 </option> <option value='https://www.ensembl.info/2010/07/'> July 2010 </option> <option value='https://www.ensembl.info/2010/06/'> June 2010 </option> <option value='https://www.ensembl.info/2010/05/'> May 2010 </option> <option value='https://www.ensembl.info/2010/04/'> April 2010 </option> <option value='https://www.ensembl.info/2010/03/'> March 2010 </option> <option value='https://www.ensembl.info/2010/02/'> February 2010 </option> <option value='https://www.ensembl.info/2010/01/'> January 2010 </option> <option value='https://www.ensembl.info/2009/12/'> December 2009 </option> <option value='https://www.ensembl.info/2009/11/'> November 2009 </option> <option 
value='https://www.ensembl.info/2009/10/'> October 2009 </option> <option value='https://www.ensembl.info/2009/09/'> September 2009 </option> <option value='https://www.ensembl.info/2009/08/'> August 2009 </option> <option value='https://www.ensembl.info/2009/07/'> July 2009 </option> <option value='https://www.ensembl.info/2009/06/'> June 2009 </option> <option value='https://www.ensembl.info/2009/05/'> May 2009 </option> <option value='https://www.ensembl.info/2009/04/'> April 2009 </option> <option value='https://www.ensembl.info/2009/03/'> March 2009 </option> <option value='https://www.ensembl.info/2009/02/'> February 2009 </option> <option value='https://www.ensembl.info/2009/01/'> January 2009 </option> <option value='https://www.ensembl.info/2008/12/'> December 2008 </option> <option value='https://www.ensembl.info/2008/11/'> November 2008 </option> <option value='https://www.ensembl.info/2008/10/'> October 2008 </option> <option value='https://www.ensembl.info/2008/09/'> September 2008 </option> <option value='https://www.ensembl.info/2008/08/'> August 2008 </option> <option value='https://www.ensembl.info/2008/07/'> July 2008 </option> <option value='https://www.ensembl.info/2008/06/'> June 2008 </option> <option value='https://www.ensembl.info/2008/05/'> May 2008 </option> <option value='https://www.ensembl.info/2008/04/'> April 2008 </option> <option value='https://www.ensembl.info/2008/03/'> March 2008 </option> <option value='https://www.ensembl.info/2008/02/'> February 2008 </option> <option value='https://www.ensembl.info/2008/01/'> January 2008 </option> <option value='https://www.ensembl.info/2007/11/'> November 2007 </option> <option value='https://www.ensembl.info/2007/08/'> August 2007 </option> <option value='https://www.ensembl.info/2007/06/'> June 2007 </option> </select> <!-- Archives dropdown handler: when the select with id "archives-dropdown-2" changes, the script below navigates to the URL held in the selected option's value; a selection whose value is an empty string is ignored. --> <script type="text/javascript"> /* <![CDATA[ */ (function() { var dropdown = document.getElementById( "archives-dropdown-2" ); function onSelectChange() { if ( 
dropdown.options[ dropdown.selectedIndex ].value !== '' ) { document.location.href = this.options[ this.selectedIndex ].value; } } dropdown.onchange = onSelectChange; })(); /* ]]> */ </script> </aside> <aside id="meta-2" class="widget widget_meta"> <h3 class='widget-title'>Meta</h3> <ul> <li><a href="https://www.ensembl.info/wp-login.php">Log in</a></li> <li><a href="https://www.ensembl.info/feed/">Entries feed</a></li> <li><a href="https://www.ensembl.info/comments/feed/">Comments feed</a></li> <li><a href="https://en-gb.wordpress.org/">WordPress.org</a></li> </ul> </aside> <aside id="custom_html-3" class="widget_text widget widget_custom_html"> <div class="textwidget custom-html-widget">© 2018 EMBL-EBI. All rights reserved. Background image by Spencer Phillips</div> </aside> <aside id="eu_cookie_law_widget-2" class="widget widget_eu_cookie_law_widget"> <div class="hide-on-button" data-hide-timeout="30" data-consent-expiration="365" id="eu-cookie-law" > <form method="post"> <input type="submit" value="Close and accept" class="accept" /> </form> This website requires cookies, and the limited processing of your personal data in order to function. By using the site you are agreeing to this as outlined in our <a href="https://www.ebi.ac.uk/data-protection/ensembl/privacy-notice" rel=""> Privacy Policy </a> </div> </aside> </div> </div> </div> <div id="primary" class="content-area col-lg-9" itemprop="mainContentOfPage"> <div itemscope itemtype="http://schema.org/Article" id="content" class="site-content" role="main"> <article id="post-9395" class="post-9395 page type-page status-publish hentry"> <header class="entry-header"> <h1 class="entry-title" itemprop="name">Project ideas</h1> </header> <div class="entry-content" itemprop="articleBody"> <p>Ensembl is part of the Genome Assembly and Annotation (GAA) section of EMBL-EBI. 
The GAA section contains other popular services such as <a href="https://www.ebi.ac.uk/metagenomics/">MGnify</a>, <a href="https://www.genenames.org/">HGNC</a>/<a href="https://vertebrate.genenames.org/">VGNC</a> and <a href="https://wormbase.org/">WormBase</a>. Below are project ideas related to the activities of the section. GAA is able to host funded contributors on summer projects or internships at various times throughout the year.<span id="more-9395"></span></p> <h2 id="Studentprojects-Projects:EnsemblGenebuild">Projects: Ensembl Genebuild</h2> <h3 id="Studentprojects-UsingDeepLearningtoClassifyRepeatFeatures">Using Deep Learning to Classify Repeat Features</h3> <p><strong>Brief Explanation</strong></p> <p>Finding and classifying repetitive DNA sequence in eukaryotic genomes is both an important first step ahead of further genome annotation, and also interesting in its own right as repeats frequently drive genome evolution. Repeats in DNA can be broken into a number of different major classes such as LINEs, SINEs and LTRs. Global biodiversity efforts such as Darwin Tree of Life, the European Reference Genome Atlas and the Earth BioGenome Project are producing hundreds and soon thousands of high-quality reference genomes, that will all need repeat annotation. Currently we have two potential approaches to annotating repeats. The first is building a repeat library for a species (using RepeatModeler) and then annotating the repeats on the genome (using RepeatMasker). This method both finds and classifies the repeats and finds lineage-specific repeats; however, building a repeat library is computationally costly. The second approach is to use an extremely fast k-mer approach (REpeatDetector, aka Red), to mask the genome in a fraction of this time. 
The downside is that this approach does not classify repeats and so is not very informative for researchers studying repeat evolution.</p> <p>In this project we want to explore Deep Learning in order to help classify repeats. We have large existing training sets across hundreds of species, spanning billions of classified repeats. As part of this project you would train a neural network to take as input an unclassified repeat sequence and label it according to the class of repeats it belongs to. You will explore the most efficient approach in terms of both preparing the training data and constructing the network. If the training is successful, we will then test the resulting model from a perspective of compute efficiency, i.e. does the model produce similar results to our existing method of classification (i.e. building a repeat library for the species and then using it to find and classify repeats) and what is the relative compute cost in each approach.</p> <p>Depending on the success and progress related to the above, there may also be the opportunity to take the project a step further, in terms of generative repeat library construction, i.e. given a fast k-mer derived set of repeat sequences and their coordinates on the genome, is it possible to generate a repeat library? 
This would be highly experimental and only considered after fast and excellent progress on the core project.</p> <p><strong>Expected results</strong></p> <ul> <li>A Deep Learning model for classifying repeat sequences into major classes</li> <li>A comparison of the efficiency of said model to our traditional approach in terms of compute cost</li> </ul> <p><strong>Required knowledge</strong></p> <ul> <li>ML frameworks such as PyTorch/Keras or similar</li> <li>Python</li> </ul> <p><strong>Desirable knowledge</strong></p> <ul> <li>Understanding of repeat biology and associated software</li> <li>Training on Slurm/LSF</li> </ul> <p><strong>Difficulty</strong></p> <p>Medium</p> <p><strong>Length</strong></p> <p>350h</p> <p><strong>Mentors</strong></p> <p><strong><a class="confluence-userlink user-mention userlink-1" title="" href="https://www.ebi.ac.uk/people/person/fergal-martin/" data-username="fergal" data-linked-resource-id="23761542" data-linked-resource-version="3" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Fergal Martin</a> <a class="confluence-userlink user-mention userlink-2" title="" href="https://www.ebi.ac.uk/people/person/leanne-haggerty/" data-username="leanne" data-linked-resource-id="50472679" data-linked-resource-version="2" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Leanne Haggerty</a> </strong></p> <h3 id="Studentprojects-ANextflowPipelineforRepeatAnnotation">A Nextflow Pipeline for Repeat Annotation</h3> <p><strong>Brief Explanation</strong></p> <p>Finding and classifying repetitive DNA sequence in eukaryotic genomes is both an important first step ahead of further genome annotation, and also interesting in its own right as repeats frequently drive genome evolution. 
Global biodiversity efforts such as Darwin Tree of Life, the European Reference Genome Atlas and the Earth BioGenome Project are producing hundreds and soon thousands of high-quality reference genomes, that will all need repeat annotation. Currently our repeat annotation pipelines are run via an in-house workflow management system, eHive. eHive is Perl based and nearing end of life and as a result we are transitioning much of our infrastructure to other workflow managers such as Nextflow.</p> <p>In this project you will work together with us to help redesign our repeat annotation pipeline. We will identify all the existing components, decide what to keep and what to remove and then come up with a final workflow. You will then implement this workflow using Nextflow and test the deployment both locally and on our various cloud partners. Time permitting we will work on costing the pipeline using a variety of species to come up with a cost per gigabase of sequence to mask repeats. Similarly, if there is additional time, we will look at large scale deployment of the pipeline on our species to build a consistent set of repeat resources for public use.</p> <p><strong>Expected results</strong></p> <ul> <li>A Nextflow pipeline to generate resources related to repetitive elements in eukaryotic genomes</li> <li>Test deployment in a production environment</li> </ul> <p><strong>Required knowledge</strong></p> <ul> <li>Nextflow</li> <li>Python</li> </ul> <p><strong>Desirable knowledge</strong></p> <ul> <li>Cloud deployment</li> <li>Containerisation</li> <li>Slurm/LSF</li> </ul> <p><strong>Difficulty</strong></p> <p>Medium</p> <p><strong>Length</strong></p> <p>350h</p> <p><strong>Mentors</strong></p> <p><strong><a class="confluence-userlink user-mention userlink-2" title="" href="https://www.ebi.ac.uk/people/person/leanne-haggerty/" data-username="leanne" data-linked-resource-id="50472679" data-linked-resource-version="2" data-linked-resource-type="userinfo" 
data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Leanne Haggerty</a> <a class="confluence-userlink user-mention userlink-3" title="" href="https://www.ebi.ac.uk/people/person/thiago-genez/" data-username="thiagogenez" data-linked-resource-id="129405047" data-linked-resource-version="1" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Thiago Genez</a> <a class="confluence-userlink user-mention userlink-1" title="" href="https://www.ebi.ac.uk/people/person/fergal-martin/" data-username="fergal" data-linked-resource-id="23761542" data-linked-resource-version="3" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Fergal Martin</a> </strong></p> <h3 id="Studentprojects-UsingDeepLearningtoIdentifyFeaturesofProtein-CodingGenes">Using Deep Learning to Identify Features of Protein-Coding Genes</h3> <p><strong>Brief Explanation</strong></p> <p>Protein-coding genes form the basis of many scientific analyses. They have direct links to important real world problems such as human health, food security and ecosystem conservation. Global biodiversity efforts such as Darwin Tree of Life, the European Reference Genome Atlas and the Earth BioGenome Project are producing hundreds and soon thousands of high-quality reference genomes, and these genomes need structural annotation of genes.</p> <p>Protein-coding genes are made up of many distinct features such as exons, introns, splice sites, codons, transcription/translation initiation and termination sites, UTRs and promoters. These features have associated signals that can be very strong, for example the coding regions almost always start with the sequence ATG, followed by triplets of nucleotides, and end in either TAG, TAA or TGA. Similarly splice sites usually have a donor GT and an acceptor AG sequence. 
Other signals such as promoters or transcription initiation/termination sites can be more complex and degenerate. It should be possible to describe the potential set of genes and the coordinates of the underlying features from the DNA sequence of the genome alone, however existing methods to do this, often based on Hidden Markov Models (HMMs), are generally inaccurate and do not produce a highly detailed or accurate annotation of the genes.</p> <p>Traditional methods to produce a high-quality annotation generally use RNA or protein data and attempt to match these data against corresponding regions of the genome to identify the genes and the corresponding substructures. The problem with these methods is that they are costly and time consuming to generate input data for (RNA), or the data may be taken from other species and the greater the evolutionary distance the higher the number of errors in the resulting annotation (often true for protein data). As such traditional methods will not work for every species, as we will not have the appropriate RNA/protein data in the majority of cases.</p> <p>In this project we want to explore Deep Learning in order to help accurately identify structures associated with protein-coding genes from the genome of the species. Together we will construct a training set of high-confidence annotations of genes using existing annotation from both our data and other sources for hundreds of eukaryotic genomes. We will attempt to first find high level features, i.e. regions of the genome likely to contain protein-coding genes based on the k-mer profiles and absence of repetitive sequence, which we will train a model to recognise. Once this initial model is accurately identifying candidate regions we will build a model to then look at fine grained feature extraction.</p> <p>We will work with you to build the high confidence set of training data we can use in terms of the analysis. 
Your main task will be to implement the high-level and fine-grained models and test different network architectures and methods of pre-processing the training data. We will work together on analysing the results against both gold standard, near complete annotations and also compare species annotated from a variety of other methods and with varying levels of quality.</p> <p><strong>Expected results</strong></p> <ul> <li>A Deep Learning approach to identifying key features of protein-coding genes</li> <li>A comparison of the results against gold standard reference annotations and other annotation approaches</li> </ul> <p><strong>Required knowledge</strong></p> <ul> <li>ML frameworks such as PyTorch/Keras or similar</li> <li>Python</li> </ul> <p><strong>Desirable knowledge</strong></p> <ul> <li>Understanding of gene annotation and associated software</li> <li>Training on Slurm/LSF</li> </ul> <p><strong>Difficulty</strong></p> <p>Very hard</p> <p><strong>Length</strong></p> <p>350h</p> <p><strong>Mentors</strong></p> <p><strong><a class="confluence-userlink user-mention userlink-1" title="" href="https://www.ebi.ac.uk/people/person/fergal-martin/" data-username="fergal" data-linked-resource-id="23761542" data-linked-resource-version="3" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Fergal Martin</a> <a class="confluence-userlink user-mention userlink-2" title="" href="https://www.ebi.ac.uk/people/person/leanne-haggerty/" data-username="leanne" data-linked-resource-id="50472679" data-linked-resource-version="2" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Leanne Haggerty</a> </strong></p> <h3 id="Studentprojects-ImprovedTranscriptRepresentationinNon-ModelOrganismsviaDeepLearning">Improved Transcript Representation in Non-Model Organisms via Deep Learning</h3> <p><strong>Brief Explanation</strong></p> <p>Genes form the 
basis of many scientific analyses. They have direct links to important real world problems such as human health, food security and ecosystem conservation. Global biodiversity efforts such as Darwin Tree of Life, the European Reference Genome Atlas and the Earth BioGenome Project are producing hundreds and soon thousands of high-quality reference genomes, and these genomes need structural annotation of genes.</p> <p>Genes are made up of many different features, but exons are arguably the key feature as they represent the blocks of the genome that are transcribed into RNA, which may form functional structures, regulate the expression of other genes or encode proteins. Under certain conditions exons may be included or skipped in the transcribed RNA, sometimes leading to different functional outcomes. A particular permutation of exons that forms a transcribed RNA is known as a transcript. While there is often one particular transcript that represents the normal state of the gene, and thus is most prevalent, it is very common to have alternative transcripts expressed, particularly in higher eukaryotes. These may be expressed in different tissues or points in time, or simply expressed continuously but at a lower level to the dominant transcript.</p> <p>It is important to have as complete a representation of the full set of transcripts in a gene as possible. Short read sequencing is a common method for finding alternative transcript structures, however the nature of the technology means we cannot be certain that the permutations of exons we infer from short read data actually exist in reality. Long read data allows us to directly observe full length RNA and thus should allow us to confidently identify alternative transcripts, but the technology is less commonplace and also does not capture as many genes as short read data. 
There are also frequently fragmented data present.</p> <p>The objective of this project will be to examine methods of better representing potential full length transcripts via deep learning. We will perform our tests in mammals, where there are several high-quality reference annotations (human and mouse in particular). We will take genes from mammals where large quantities of long read data are available and identify high confidence sets of alternative transcripts. We will then utilise the union of the exons described in these transcripts to attempt to help train a model capable of validating alternative transcripts. There are two approaches we could take, the first would be to find the longest possible exon chain, assume this is the dominant transcript and automatically generate a set of alternative transcripts with exon skipping, where the model would produce a binary output as to whether or not a permutation was valid. This approach would be straightforward, but as some genes can have many exons, this could generate many permutations. The other approach would be to try and build a generative model, where the input is the union of all unique exons across the input set, while the output would be a set of transcripts and the exons contained in each. This would be more robust, but would require a more complex model.</p> <p>The project will involve you working with us to identify suitable training data from our existing annotations and assessing and implementing a suitable approach to using the data to train a model. Your work will help decide which approach is most viable and you will be responsible for implementing and training the corresponding model.</p> <p>We will test the resulting model in terms of how accurately it can validate true alternative transcripts in both gold standard and non-model mammalian species. 
Time permitting we may consider extending past mammals into other eukaryotes to see how generalisable it is.</p> <p><strong>Expected results</strong></p> <ul> <li>A Deep Learning approach to identifying valid alternative transcripts</li> <li>A comparison of the results against gold standard reference annotations and non-model annotations</li> </ul> <p><strong>Required knowledge</strong></p> <ul> <li>ML frameworks such as PyTorch/Keras or similar</li> <li>Python</li> </ul> <p><strong>Desirable knowledge</strong></p> <ul> <li>Understanding of gene annotation, particularly transcriptomic data and associated software</li> <li>Training on Slurm/LSF</li> </ul> <p><strong>Difficulty</strong></p> <p>Hard</p> <p><strong>Length</strong></p> <p>350h</p> <p><strong>Mentors</strong></p> <p><strong><a class="confluence-userlink user-mention userlink-1" title="" href="https://www.ebi.ac.uk/people/person/fergal-martin/" data-username="fergal" data-linked-resource-id="23761542" data-linked-resource-version="3" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Fergal Martin</a> <a class="confluence-userlink user-mention userlink-2" title="" href="https://www.ebi.ac.uk/people/person/leanne-haggerty/" data-username="leanne" data-linked-resource-id="50472679" data-linked-resource-version="2" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Leanne Haggerty</a> <a class="confluence-userlink user-mention userlink-4" title="" href="https://www.ebi.ac.uk/people/person/adam-frankish/" data-username="frankish" data-linked-resource-id="57283199" data-linked-resource-version="1" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Adam Frankish</a> </strong></p> <h3 id="Studentprojects-ImprovingthedefinitionofUTRboundariesviaDeepLearning">Improving the definition of UTR boundaries via 
Deep Learning</h3> <p><strong>Brief Explanation</strong></p> <p>Untranslated regions (UTRs) represent the boundaries of protein-coding genes. These regions are important for understanding where one gene ends and a neighbouring gene starts. UTR regions sometimes house features that regulate the expression of the gene in addition to being key to analysing the expression of the gene when using single cell data.</p> <p>Annotating UTRs is difficult. It is clear from long and short read transcriptomic data that there is rarely a precise start/end to the UTRs of a gene. There are usually regions where the transcriptional machinery is more likely to attach or detach. In particular, short read data (which is most frequently available) is naturally imprecise for determining the start/end of the UTR as each read represents a small fragment of the gene. If the sampling of these small fragments is uneven, it leads to incorrect identification of the start/end. At the same time the cellular machinery for transcription is able to identify these binding/release regions despite not fundamentally changing across eukaryotes, so it should be possible to directly identify their approximate locations directly from the genome sequence.</p> <p>In this project we will explore the use of long read data and high-quality reference annotations to train a model to predict the location of a UTR start or end from a sequence adjacent to a coding region start/end. While it will not be possible to do this for all UTRs, particularly ones that are very long or have large introns contained within, we will be able to train to predict simple UTR start/ends within a fixed window. This will assist with better representation of UTRs, particularly in species lacking transcriptomic data.</p> <p>We will work together to build a training set consisting of genes where we are confident we have captured representative UTR boundaries. 
When several possible boundaries in one of these genes are present, we will select the longest UTR boundary, unless it is infrequently observed relative to the number of long reads mapped to the gene (in which case the boundary will be set to be a balance of the longest UTR observed in more than 20 percent of the reads). We will use as much of the sequence of the flanking region as possible along with the coordinate of the selected boundary, to then train the model to predict the boundary coordinates. You will be responsible for building the network and testing different hyperparameters during training. We will then compare to gold standard reference annotations and look at the approximate distance between the predicted and true boundaries to evaluate the model.</p> <p><strong>Expected results</strong></p> <ul> <li>A Deep Learning model for predicting the coordinates of 5′ and 3′ UTR boundaries</li> <li>A comparison of the results against gold standard reference annotations</li> </ul> <p><strong>Required knowledge</strong></p> <ul> <li>ML frameworks such as PyTorch/Keras or similar</li> <li>Python</li> </ul> <p><strong>Desirable knowledge</strong></p> <ul> <li>Understanding of gene structures, particularly UTRs</li> <li>Training on Slurm/LSF</li> </ul> <p><strong>Difficulty</strong></p> <p>Hard</p> <p><strong>Length</strong></p> <p>350h</p> <p><strong>Mentors</strong></p> <p><strong><a class="confluence-userlink user-mention userlink-1" title="" href="https://www.ebi.ac.uk/people/person/fergal-martin/" data-username="fergal" data-linked-resource-id="23761542" data-linked-resource-version="3" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Fergal Martin</a> <a class="confluence-userlink user-mention userlink-2" title="" href="https://www.ebi.ac.uk/people/person/leanne-haggerty/" data-username="leanne" data-linked-resource-id="50472679" data-linked-resource-version="2" 
data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Leanne Haggerty</a> <a class="confluence-userlink user-mention userlink-4" title="" href="https://www.ebi.ac.uk/people/person/adam-frankish/" data-username="frankish" data-linked-resource-id="57283199" data-linked-resource-version="1" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Adam Frankish</a> </strong></p> <h2 id="Studentprojects-Projects:SoftwareDevelopmentforMGnify">Projects: Software Development for <a class="external-link" href="https://www.ebi.ac.uk/metagenomics/" rel="nofollow">MGnify</a></h2> <h3 id="Studentprojects-MGnifyDataVisualisations">MGnify Data Visualisations</h3> <p id="Studentprojects-BriefExplanation"><strong>Brief Explanation</strong></p> <p>MGnify (<a class="external-link" href="https://www.ebi.ac.uk/metagenomics" rel="nofollow">https://www.ebi.ac.uk/metagenomics</a>) is a freely available hub for the analysis and exploration of metagenomic, metatranscriptomic, amplicon and assembly data. The resource provides rich functional and taxonomic analyses of user-submitted sequences, as well as analysis of publicly available metagenomic datasets held within the European Nucleotide Archive (ENA).</p> <p>The public-facing service is a React.js website backed by a Python/Django REST API, which serves metagenomics data and associated analyses via API endpoints and data files. There are also micro-services for specific tasks like sequence searches. 
In addition to the website, MGnify provides <a class="external-link" href="https://docs.mgnify.org/src/docs/notebooks.html" rel="nofollow">hosted Jupyter Notebooks</a> to cover extra use cases and showcase how the MGnify API-provided data can be used in downstream data analysis tasks (using R and Python).</p> <p>Together, the website and notebooks include many data visualisations built using various technologies: Highcharts (JavaScript) for website graphics like nucleotide distributions, specialised JavaScript components like the Integrative Genomics Viewer for genome annotations, and matplotlib and ggplot for graphics created in the Jupyter notebooks.</p> <p>As MGnify approaches the release of our next-generation analysis pipeline, the aim is to develop a reusable framework for managing these visualisations. Specifically, we aim to reuse components and libraries in as many places as possible, and to support FAIR (Findable, Accessible, Interoperable, Reusable) principles by enabling our users to easily build upon the visualisations we provide. An example could be: the MGnify website using a <a class="external-link" href="https://d3js.org/" rel="nofollow">d3.js</a> histogram to display protein annotation information, from where users can jump to an <a class="external-link" href="https://observablehq.com/" rel="nofollow">Observable JS Notebook</a> with the required API fetching code and d3 visualisation code ready for them to modify to produce a graphic suitable for their own publication.</p> <p id="Studentprojects-Expectedresults"><strong>Expected results</strong></p> <ul> <li>Propose a rational approach to data visualisations across MGnify frontends</li> <li>Migrate a subset of the existing website visualisations to conform with the new approach</li> <li>Implement user-editable visualisations (e.g. 
Notebooks)</li> <li>Document and provide code examples for the approaches and libraries used</li> </ul> <p id="Studentprojects-Requiredknowledge"><strong>Required knowledge</strong></p> <ul> <li>Python: some data analysis experience required (e.g. Pandas, Matplotlib, Spark)</li> <li>Javascript: some modern front-end work required (e.g. React) and visualisation experience (e.g. d3.js)</li> <li>Ideally some experience with notebook coding: Jupyter or Observable.</li> </ul> <p id="Studentprojects-Difficulty"><strong>Difficulty</strong></p> <p>Adaptable</p> <p id="Studentprojects-Expectedsizeofproject(175or350hour)"><strong>Expected size of project (175 or 350 hour) </strong></p> <p>175 hours</p> <p id="Studentprojects-Mentors"><strong>Mentors</strong></p> <p><a class="confluence-userlink user-mention userlink-5" title="" href="https://www.ebi.ac.uk/people/person/martin-beracochea/" data-username="mbc" data-linked-resource-id="101067667" data-linked-resource-version="1" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Martin Beracochea</a> <a class="confluence-userlink user-mention userlink-6" title="" href="https://www.ebi.ac.uk/people/person/sandy-alexander-rogers/" data-username="sandyr" data-linked-resource-id="149785280" data-linked-resource-version="1" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Alexander Rogers</a></p> <h2 id="Studentprojects-Projects:HAVANA">Projects: HAVANA</h2> <h3 id="Studentprojects-Defininggeneboundaries">Defining gene boundaries</h3> <p><strong>Brief Explanation</strong></p> <p>Understanding the impact of genetic variation on disease requires comprehensive gene annotation. 
Human genes are well characterised following more than two decades of work on their annotation; however, we know that this annotation is not complete and that new experimental methods are generating data to help us towards the goal of complete gene annotation. Long transcriptomic reads allow us to identify and annotate many new features, including the start and end of a transcript, which can be combined to give information for genes. We would like to develop a pipeline to extract long transcriptomic data from the European Nucleotide Archive (ENA), map to the human reference genome and extract the terminal co-ordinates to create a growing collection of transcript start/end positions. This data will support improving the accuracy of gene annotation of individual transcripts and genes and give insight into any differences between transcript start and end sites across different tissues.</p> <p><strong>Expected results</strong></p> <ul> <li>Code to extract read data from ENA, map to genome and calculate termini</li> <li>Database of termini and read metadata</li> <li>Code to extract data from database and format for browser viewing</li> </ul> <p><strong>Required knowledge</strong></p> <ul> <li>Transcript mapping (long-read RNA-seq alignment)</li> <li>Workflow manager</li> </ul> <p><strong>Difficulty</strong></p> <p>Adaptable</p> <p><strong>Length</strong></p> <p>350h</p> <p><strong>Mentors</strong></p> <p><a class="confluence-userlink user-mention userlink-8" title="" href="https://www.ebi.ac.uk/people/person/jonathan-mudge/" data-username="jmudge" data-linked-resource-id="54629678" data-linked-resource-version="1" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Jonathan Mudge</a> <a class="confluence-userlink user-mention userlink-9" title="" href="https://www.ebi.ac.uk/people/person/jose-manuel-gonzalez-martinez/" data-username="jmgonzalez" data-linked-resource-id="57304957" 
data-linked-resource-version="2" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Jose Gonzalez</a> <a class="confluence-userlink user-mention userlink-4" title="" href="https://www.ebi.ac.uk/people/person/adam-frankish/" data-username="frankish" data-linked-resource-id="57283199" data-linked-resource-version="1" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Adam Frankish</a></p> <h3 id="Studentprojects-Usingmachinelearningtoannotatedifficultgenes">Using machine learning to annotate difficult genes</h3> <p><strong>Brief Explanation</strong></p> <p>Understanding the impact of genetic variation on disease requires comprehensive gene annotation. Human genes are well characterised following more than two decades of work on their annotation; however, we know that this annotation is not complete and that new experimental methods are generating data to help us towards the goal of complete gene annotation. We have developed an automated workflow to use long transcriptomic data to add novel alternatively spliced transcripts to our gene annotation. Our method uses very strict thresholds to ensure that no poor-quality models are added to the gene annotation, although as a consequence we reject significant numbers of viable novel transcripts. 
We want to use machine learning to recover good quality but rejected transcripts and improve the setting of initial filters for new datasets.</p> <p><strong>Expected results</strong></p> <ul> <li>Install and learn to use a machine learning package</li> <li>Run it on known gene annotation</li> <li>Deliverable: simple model that helps to recover valid transcripts; set of most relevant features for decision making</li> </ul> <p><strong>Required knowledge</strong></p> <ul> <li>Machine learning</li> </ul> <p><strong>Difficulty</strong></p> <p>Adaptable</p> <p><strong>Length</strong></p> <p>350h</p> <p><strong>Mentors</strong></p> <p><a class="confluence-userlink user-mention userlink-8" title="" href="https://www.ebi.ac.uk/people/person/jonathan-mudge/" data-username="jmudge" data-linked-resource-id="54629678" data-linked-resource-version="1" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Jonathan Mudge</a> <a class="confluence-userlink user-mention userlink-9" title="" href="https://www.ebi.ac.uk/people/person/jose-manuel-gonzalez-martinez/" data-username="jmgonzalez" data-linked-resource-id="57304957" data-linked-resource-version="2" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Jose Gonzalez</a> <a class="confluence-userlink user-mention userlink-4" title="" href="https://www.ebi.ac.uk/people/person/adam-frankish/" data-username="frankish" data-linked-resource-id="57283199" data-linked-resource-version="1" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Adam Frankish</a></p> <h2 id="Studentprojects-Projects:Metazoa">Projects: Metazoa</h2> <h3 id="Studentprojects-Developanautomaticsystemtoflaganyupdated/newspeciesandrankthem">Develop an automatic system to flag any updated/new species and rank them</h3> <p><strong>Brief Explanation</strong></p> 
<p>Ensembl Metazoa plans every release by manually collecting a list of available species from INSDC resources a few months in advance, and then going over their available information (e.g. taxonomic clade, assembly quality, annotation availability/quality, RefSeq availability, etc.) to filter out and select about 20 species that will be processed and loaded into the next Ensembl release. As an example, taxonomic information is used to highlight species that cover new clades not present in Ensembl, as well as those that bring novel information to existing clades, e.g. new locust genomes in the well-known Neoptera clade.</p> <p>In our plans to expand our Ensembl Metazoa resources we would like to introduce automation in the process described above to check available new species/updates from INSDC resources, as well as create a system that allows us to rank them depending on different criteria. This system should collect the data on a regular basis, e.g. monthly, and provide all the required information to easily ingest it into our production loading system, e.g. GCA, species name, strain, common name, taxonomy,… Additionally, it would be desirable if the new system could rely on our JIRA tracking system to create and update this information, so we can feed this information programmatically into our processing and loading system.</p> <p><strong>Expected results</strong></p> <ul> <li>Automatic system that can run monthly and provide a list of available updates and new species in INSDC</li> </ul> <p><strong>Required knowledge</strong></p> <ul> <li>Python + pytest</li> </ul> <p><strong>Desirable knowledge</strong></p> <ul> <li>Understanding of taxonomy information</li> <li>Understanding of INSDC resources and their REST end-points, i.e. 
Entrez, ENA Portal API</li> <li>JIRA</li> </ul> <p><strong>Difficulty</strong></p> <p>Medium</p> <p><strong>Length</strong></p> <p>350h</p> <p><strong>Mentors</strong></p> <p><strong><a class="confluence-userlink user-mention userlink-1" title="" href="https://www.ebi.ac.uk/seqdb/confluence/display/~jalvarez" data-username="jalvarez" data-linked-resource-id="97129775" data-linked-resource-version="3" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Jorge Alvarez</a></strong></p> <h3 id="Studentprojects-Expandthespeciessearchfunctionalityforbetawebsite">Expand the species search functionality for beta website</h3> <p><strong>Brief Explanation</strong></p> <p>The search engine of any website can be one of the most useful tools for users to help them easily retrieve the information they are looking for. Currently, Ensembl’s search tool works based on indexed fields of our databases, which mainly cover key information, e.g. genes, species, proteins, including many synonyms for every one of them. As we plan to move to our new beta website by the end of 2023, we want to make our search engine even better so our users can enjoy the experience of using Ensembl even more.</p> <p>We would like to expand our Ensembl beta’s search functionality to include and support searching based on taxonomic information. In particular, we are interested in providing users with a list of close relatives when a given species is requested and it is not part of Ensembl (yet), returning the list of species available given a taxonomic clade instead of a species name, or finding a species even when a (homotypic) synonym is provided instead of its current scientific name. 
The objective of this project is to create a standalone Elasticsearch tool that can handle taxonomic-related requests.</p> <p><strong>Expected results</strong></p> <ul> <li>Search tool returns the actual species’ link when the species is in Ensembl, including checking for taxonomy synonyms</li> <li>Search tool returns options for close-relatives of introduced species (if any) when the species is not part of Ensembl</li> <li>Search tool returns options for species within the given taxonomy clade (if any)</li> </ul> <p><strong>Required knowledge</strong></p> <ul> <li>Python</li> <li>Elasticsearch</li> <li>MySQL</li> </ul> <p><strong>Desirable knowledge</strong></p> <ul> <li>Understanding of taxonomy information</li> <li>Django</li> </ul> <p><strong>Difficulty</strong></p> <p>Medium</p> <p><strong>Length</strong></p> <p>350h</p> <p><strong>Mentors</strong></p> <p><strong><a class="confluence-userlink user-mention userlink-10" title="" href="https://www.ebi.ac.uk/seqdb/confluence/display/~sdyer" data-username="sdyer" data-linked-resource-id="140934285" data-linked-resource-version="1" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Sarah Dyer</a> <a class="confluence-userlink user-mention userlink-1" title="" href="https://www.ebi.ac.uk/seqdb/confluence/display/~jalvarez" data-username="jalvarez" data-linked-resource-id="97129775" data-linked-resource-version="3" data-linked-resource-type="userinfo" data-base-url="https://www.ebi.ac.uk/seqdb/confluence" data-user-hover-bound="true">Jorge Alvarez</a></strong></p> </div> </article> </div> <footer class="site-info" itemscope itemtype="http://schema.org/WPFooter"> <a href="http://wordpress.org/" title="Semantic Personal Publishing Platform">Proudly powered by WordPress</a>. Theme: Flat 1.0.0 by <a rel="nofollow" href="https://themeisle.com/themes/flat/" title="Flat WordPress Theme">Themeisle</a>. 
</footer> </div> </div> </div> </div> <script type="text/javascript" src="https://www.ensembl.info/wp-content/plugins/jetpack/_inc/build/widgets/eu-cookie-law/eu-cookie-law.min.js?ver=20180522" id="eu-cookie-law-script-js"></script> <script type="text/javascript" src="https://www.ensembl.info/wp-content/plugins/page-links-to/dist/new-tab.js?ver=3.3.6" id="page-links-to-js"></script> <script defer type="text/javascript" src="https://stats.wp.com/e-202447.js" id="jetpack-stats-js"></script> <script type="text/javascript" id="jetpack-stats-js-after"> /* <![CDATA[ */ _stq = window._stq || []; _stq.push([ "view", {v:'ext',blog:'52918201',post:'9395',tz:'0',srv:'www.ensembl.info',j:'1:12.9.4'} ]); _stq.push([ "clickTrackerInit", "52918201", "9395" ]); /* ]]> */ </script> </body> </html> <!-- Dynamic page generated in 0.748 seconds. --> <!-- Cached page generated by WP-Super-Cache on 2024-11-24 02:45:53 --> <!-- super cache -->