CINXE.COM
<!doctype html><html lang="en"><head><title data-rh="true">Neural Machine Translation. Machine Translation using Recurrent… | by Quinn Lanners | Towards Data Science</title><meta data-rh="true" charset="utf-8"/><meta data-rh="true" name="viewport" content="width=device-width,minimum-scale=1,initial-scale=1,maximum-scale=1"/><meta data-rh="true" name="theme-color" content="#000000"/><meta data-rh="true" name="twitter:app:name:iphone" content="Medium"/><meta data-rh="true" name="twitter:app:id:iphone" content="828256236"/><meta data-rh="true" property="al:ios:app_name" content="Medium"/><meta data-rh="true" property="al:ios:app_store_id" content="828256236"/><meta data-rh="true" property="al:android:package" content="com.medium.reader"/><meta data-rh="true" property="fb:app_id" content="542599432471018"/><meta data-rh="true" property="og:site_name" content="Medium"/><meta data-rh="true" property="og:type" content="article"/><meta data-rh="true" property="article:published_time" content="2019-06-07T15:48:12.033Z"/><meta data-rh="true" name="title" content="Neural Machine Translation. Machine Translation using Recurrent… | by Quinn Lanners | Towards Data Science"/><meta data-rh="true" property="og:title" content="Neural Machine Translation"/><meta data-rh="true" property="al:android:url" content="medium://p/15ecf6b0b"/><meta data-rh="true" property="al:ios:url" content="medium://p/15ecf6b0b"/><meta data-rh="true" property="al:android:app_name" content="Medium"/><meta data-rh="true" name="description" content="A guide to Neural Machine Translation using an Encoder Decoder structure with attention. 
Includes a detailed tutorial using PyTorch in Google Colaboratory."/><meta data-rh="true" property="og:description" content="Machine Translation using Recurrent Neural Networks (includes tutorial in PyTorch)"/><meta data-rh="true" property="og:url" content="https://towardsdatascience.com/neural-machine-translation-15ecf6b0b"/><meta data-rh="true" property="al:web:url" content="https://towardsdatascience.com/neural-machine-translation-15ecf6b0b"/><meta data-rh="true" property="og:image" content="https://miro.medium.com/v2/resize:fit:960/1*H441VINdbjxItCdtgb-1Xw.jpeg"/><meta data-rh="true" property="article:author" content="https://medium.com/@lannersq"/><meta data-rh="true" name="author" content="Quinn Lanners"/><meta data-rh="true" name="robots" content="index,noarchive,follow,max-image-preview:large"/><meta data-rh="true" name="referrer" content="unsafe-url"/><meta data-rh="true" property="twitter:title" content="Neural Machine Translation"/><meta data-rh="true" name="twitter:site" content="@TDataScience"/><meta data-rh="true" name="twitter:app:url:iphone" content="medium://p/15ecf6b0b"/><meta data-rh="true" property="twitter:description" content="Machine Translation using Recurrent Neural Networks (includes tutorial in PyTorch)"/><meta data-rh="true" name="twitter:image:src" content="https://miro.medium.com/v2/resize:fit:960/1*H441VINdbjxItCdtgb-1Xw.jpeg"/><meta data-rh="true" name="twitter:card" content="summary_large_image"/><meta data-rh="true" name="twitter:label1" content="Reading time"/><meta data-rh="true" name="twitter:data1" content="21 min read"/><link data-rh="true" rel="icon" href="https://miro.medium.com/v2/resize:fill:256:256/1*VzTUkfeGymHP4Bvav-T-lA.png"/><link data-rh="true" rel="search" type="application/opensearchdescription+xml" title="Medium" href="/osd.xml"/><link data-rh="true" rel="apple-touch-icon" sizes="152x152" href="https://miro.medium.com/v2/resize:fill:304:304/10fd5c419ac61637245384e7099e131627900034828f4f386bdaa47a74eae156"/><link 
data-rh="true" rel="apple-touch-icon" sizes="120x120" href="https://miro.medium.com/v2/resize:fill:240:240/10fd5c419ac61637245384e7099e131627900034828f4f386bdaa47a74eae156"/><link data-rh="true" rel="apple-touch-icon" sizes="76x76" href="https://miro.medium.com/v2/resize:fill:152:152/10fd5c419ac61637245384e7099e131627900034828f4f386bdaa47a74eae156"/><link data-rh="true" rel="apple-touch-icon" sizes="60x60" href="https://miro.medium.com/v2/resize:fill:120:120/10fd5c419ac61637245384e7099e131627900034828f4f386bdaa47a74eae156"/><link data-rh="true" rel="mask-icon" href="https://miro.medium.com/v2/resize:fill:1000:1000/7*GAOKVe--MXbEJmV9230oOQ.png" color="#171717"/><link data-rh="true" id="glyph_preload_link" rel="preload" as="style" type="text/css" href="https://glyph.medium.com/css/unbound.css"/><link data-rh="true" id="glyph_link" rel="stylesheet" type="text/css" href="https://glyph.medium.com/css/unbound.css"/><link data-rh="true" rel="author" href="https://medium.com/@lannersq"/><link data-rh="true" rel="canonical" href="https://towardsdatascience.com/neural-machine-translation-15ecf6b0b"/><link data-rh="true" rel="alternate" href="android-app://com.medium.reader/https/medium.com/p/15ecf6b0b"/><script data-rh="true" type="application/ld+json">{"@context":"http:\u002F\u002Fschema.org","@type":"NewsArticle","image":["https:\u002F\u002Fmiro.medium.com\u002Fv2\u002Fresize:fit:1200\u002F1*H441VINdbjxItCdtgb-1Xw.jpeg"],"url":"https:\u002F\u002Ftowardsdatascience.com\u002Fneural-machine-translation-15ecf6b0b","dateCreated":"2019-06-03T22:52:04.362Z","datePublished":"2019-06-03T22:52:04.362Z","dateModified":"2022-03-30T20:25:23.343Z","headline":"Neural Machine Translation - Towards Data Science","name":"Neural Machine Translation - Towards Data Science","description":"A guide to Neural Machine Translation using an Encoder Decoder structure with attention. 
Includes a detailed tutorial using PyTorch in Google Colaboratory.","identifier":"15ecf6b0b","author":{"@type":"Person","name":"Quinn Lanners","url":"https:\u002F\u002Ftowardsdatascience.com\u002F@lannersq"},"creator":["Quinn Lanners"],"publisher":{"@type":"Organization","name":"Towards Data Science","url":"towardsdatascience.com","logo":{"@type":"ImageObject","width":192,"height":60,"url":"https:\u002F\u002Fmiro.medium.com\u002Fv2\u002Fresize:fit:384\u002F1*cFFKn8rFH4ZndmaYeAs6iQ.png"}},"mainEntityOfPage":"https:\u002F\u002Ftowardsdatascience.com\u002Fneural-machine-translation-15ecf6b0b"}</script><style type="text/css" data-fela-rehydration="586" data-fela-type="STATIC">html{box-sizing:border-box;-webkit-text-size-adjust:100%}*, *:before, *:after{box-sizing:inherit}body{margin:0;padding:0;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;color:rgba(0,0,0,0.8);position:relative;min-height:100vh}h1, h2, h3, h4, h5, h6, dl, dd, ol, ul, menu, figure, blockquote, p, pre, form{margin:0}menu, ol, ul{padding:0;list-style:none;list-style-image:none}main{display:block}a{color:inherit;text-decoration:none}a, button, input{-webkit-tap-highlight-color:transparent}img, svg{vertical-align:middle}button{background:transparent;overflow:visible}button, input, optgroup, select, textarea{margin:0}:root{--reach-tabs:1;--reach-menu-button:1}#speechify-root{font-family:Sohne, sans-serif}div[data-popper-reference-hidden="true"]{visibility:hidden;pointer-events:none}.grecaptcha-badge{visibility:hidden} /*XCode style (c) Angel Garcia <angelgarcia.mail@gmail.com>*/.hljs {background: #fff;color: black; }/* Gray DOCTYPE selectors like WebKit */ .xml .hljs-meta {color: #c0c0c0; }.hljs-comment, .hljs-quote {color: #007400; }.hljs-tag, .hljs-attribute, .hljs-keyword, .hljs-selector-tag, .hljs-literal, .hljs-name {color: #aa0d91; }.hljs-variable, .hljs-template-variable {color: #3F6E74; }.hljs-code, .hljs-string, .hljs-meta .hljs-string {color: #c41a16; }.hljs-regexp, 
.hljs-link {color: #0E0EFF; }.hljs-title, .hljs-symbol, .hljs-bullet, .hljs-number {color: #1c00cf; }.hljs-section, .hljs-meta {color: #643820; }.hljs-title.class_, .hljs-class .hljs-title, .hljs-type, .hljs-built_in, .hljs-params {color: #5c2699; }.hljs-attr {color: #836C28; }.hljs-subst {color: #000; }.hljs-formula {background-color: #eee;font-style: italic; }.hljs-addition {background-color: #baeeba; }.hljs-deletion {background-color: #ffc8bd; }.hljs-selector-id, .hljs-selector-class {color: #9b703f; }.hljs-doctag, .hljs-strong {font-weight: bold; }.hljs-emphasis {font-style: italic; } </style><style type="text/css" data-fela-rehydration="586" data-fela-type="KEYFRAME">@-webkit-keyframes k1{0%{opacity:0.8}50%{opacity:0.5}100%{opacity:0.8}}@-moz-keyframes k1{0%{opacity:0.8}50%{opacity:0.5}100%{opacity:0.8}}@keyframes k1{0%{opacity:0.8}50%{opacity:0.5}100%{opacity:0.8}}</style><style type="text/css" data-fela-rehydration="586" data-fela-type="RULE">.a{font-family:medium-content-sans-serif-font, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen, Ubuntu, Cantarell, "Open Sans", "Helvetica Neue", sans-serif}.b{font-weight:400}.c{background-color:rgba(255, 255, 255, 1)}.l{display:block}.m{position:sticky}.n{top:0}.o{z-index:500}.p{padding:0 24px}.q{align-items:center}.r{border-bottom:solid 1px #F2F2F2}.y{height:41px}.z{line-height:20px}.ab{display:flex}.ac{height:57px}.ae{flex:1 0 auto}.af{color:inherit}.ag{fill:inherit}.ah{font-size:inherit}.ai{border:inherit}.aj{font-family:inherit}.ak{letter-spacing:inherit}.al{font-weight:inherit}.am{padding:0}.an{margin:0}.ao{cursor:pointer}.ap:disabled{cursor:not-allowed}.aq:disabled{color:#6B6B6B}.ar:disabled{fill:#6B6B6B}.au{width:auto}.av path{fill:#242424}.aw{height:25px}.ax{margin-left:16px}.ay{border:none}.az{border-radius:20px}.ba{width:240px}.bb{background:#F9F9F9}.bc path{fill:#6B6B6B}.be{outline:none}.bf{font-family:sohne, "Helvetica Neue", Helvetica, Arial, 
sans-serif}.bg{font-size:14px}.bh{width:100%}.bi{padding:10px 20px 10px 0}.bj{background-color:transparent}.bk{color:#242424}.bl::placeholder{color:#6B6B6B}.bm{display:inline-block}.bn{margin-left:12px}.bo{margin-right:12px}.bp{border-radius:4px}.bq{margin-left:24px}.br{height:24px}.bx{background-color:#F9F9F9}.by{border-radius:50%}.bz{height:32px}.ca{width:32px}.cb{justify-content:center}.ch{max-width:680px}.ci{min-width:0}.cj{animation:k1 1.2s ease-in-out infinite}.ck{height:100vh}.cl{margin-bottom:16px}.cm{margin-top:48px}.cn{align-items:flex-start}.co{flex-direction:column}.cp{justify-content:space-between}.cq{margin-bottom:24px}.cw{width:80%}.cx{background-color:#F2F2F2}.dd{height:44px}.de{width:44px}.df{margin:auto 0}.dg{margin-bottom:4px}.dh{height:16px}.di{width:120px}.dj{width:80px}.dp{margin-bottom:8px}.dq{width:96%}.dr{width:98%}.ds{width:81%}.dt{margin-left:8px}.du{color:#6B6B6B}.dv{font-size:13px}.dw{height:100%}.ep{color:#FFFFFF}.eq{fill:#FFFFFF}.er{background:rgba(102, 138, 170, 1)}.es{border-color:rgba(102, 138, 170, 1)}.ew:disabled{cursor:inherit !important}.ex:disabled{opacity:0.3}.ey:disabled:hover{background:rgba(102, 138, 170, 1)}.ez:disabled:hover{border-color:rgba(102, 138, 170, 1)}.fa{border-radius:99em}.fb{border-width:1px}.fc{border-style:solid}.fd{box-sizing:border-box}.fe{text-decoration:none}.ff{text-align:center}.fi{margin-right:32px}.fj{position:relative}.fk{fill:#6B6B6B}.fn{background:transparent}.fo svg{margin-left:4px}.fp svg{fill:#6B6B6B}.fr{box-shadow:inset 0 0 0 1px rgba(0, 0, 0, 0.05)}.fs{position:absolute}.fz{margin:0 24px}.gd{background:rgba(255, 255, 255, 1)}.ge{border:1px solid #F2F2F2}.gf{box-shadow:0 1px 4px #F2F2F2}.gg{max-height:100vh}.gh{overflow-y:auto}.gi{left:0}.gj{top:calc(100vh + 100px)}.gk{bottom:calc(100vh + 
100px)}.gl{width:10px}.gm{pointer-events:none}.gn{word-break:break-word}.go{word-wrap:break-word}.gp:after{display:block}.gq:after{content:""}.gr:after{clear:both}.gs{line-height:1.23}.gt{letter-spacing:0}.gu{font-style:normal}.gv{font-weight:700}.hq{margin-bottom:-0.27em}.hr{line-height:1.394}.im{align-items:baseline}.in{width:48px}.io{height:48px}.ip{border:2px solid rgba(255, 255, 255, 1)}.iq{z-index:0}.ir{box-shadow:none}.is{border:1px solid rgba(0, 0, 0, 0.05)}.it{margin-left:-12px}.iu{width:28px}.iv{height:28px}.iw{z-index:1}.ix{width:24px}.iy{margin-bottom:2px}.iz{flex-wrap:nowrap}.ja{font-size:16px}.jb{line-height:24px}.jd{margin:0 8px}.je{display:inline}.jf{color:rgba(102, 138, 170, 1)}.jg{fill:rgba(102, 138, 170, 1)}.jj{flex:0 0 auto}.jm{flex-wrap:wrap}.jp{white-space:pre-wrap}.jq{margin-right:4px}.jr{overflow:hidden}.js{max-height:20px}.jt{text-overflow:ellipsis}.ju{display:-webkit-box}.jv{-webkit-line-clamp:1}.jw{-webkit-box-orient:vertical}.jx{word-break:break-all}.jz{padding-left:8px}.ka{padding-right:8px}.lb> *{flex-shrink:0}.lc{overflow-x:scroll}.ld::-webkit-scrollbar{display:none}.le{scrollbar-width:none}.lf{-ms-overflow-style:none}.lg{width:74px}.lh{flex-direction:row}.li{z-index:2}.ll{-webkit-user-select:none}.lm{border:0}.ln{fill:rgba(117, 117, 117, 1)}.lq{outline:0}.lr{user-select:none}.ls> svg{pointer-events:none}.mb{cursor:progress}.mc{margin-left:4px}.md{margin-top:0px}.me{opacity:1}.mf{padding:4px 0}.mi{width:16px}.mk{display:inline-flex}.mq{max-width:100%}.mr{padding:8px 2px}.ms svg{color:#6B6B6B}.nj{margin-left:auto}.nk{margin-right:auto}.nl{max-width:960px}.nr{clear:both}.nt{cursor:zoom-in}.nu{z-index:auto}.nw{height:auto}.nx{margin-top:10px}.ny{max-width:728px}.ob{line-height:1.58}.oc{letter-spacing:-0.004em}.od{font-family:source-serif-pro, Georgia, Cambria, "Times New Roman", Times, 
serif}.ow{margin-bottom:-0.46em}.ox{text-decoration:underline}.oy{font-style:italic}.oz{margin-top:32px}.pa{margin-bottom:14px}.pb{padding-top:24px}.pc{padding-bottom:10px}.pd{background-color:#000000}.pe{height:3px}.pf{width:3px}.pg{margin-right:20px}.ph{line-height:1.12}.pi{letter-spacing:-0.022em}.pj{font-weight:600}.qc{margin-bottom:-0.28em}.qi{max-width:1868px}.qj{max-width:163px}.qk{max-width:753px}.ql{max-width:1036px}.qm{max-width:100px}.qn{max-width:1972px}.qo{max-width:312px}.qp{max-width:441px}.qq{max-width:752px}.qr{max-width:963px}.qs{max-width:970px}.qt{max-width:498px}.qu{max-width:419px}.qv{max-width:788px}.qw{max-width:422px}.qx{margin:auto}.qy{padding-bottom:100%}.qz{height:0}.ra{max-width:1116px}.rb{max-width:688px}.rc{max-width:383px}.rd{max-width:230px}.re{max-width:446px}.rf{max-width:1260px}.rg{max-width:445px}.rh{max-width:760px}.ri{max-width:496px}.rj{max-width:802px}.rk{margin-bottom:26px}.rl{margin-top:6px}.rm{margin-top:8px}.rn{margin-right:8px}.ro{padding:8px 16px}.rp{border-radius:100px}.rq{transition:background 300ms ease}.rs{white-space:nowrap}.rt{border-top:none}.ru{height:52px}.rv{max-height:52px}.rw{box-sizing:content-box}.rx{position:static}.rz{max-width:155px}.sk{height:0px}.sl{margin-bottom:40px}.sm{margin-bottom:48px}.ta{border-radius:2px}.tc{height:64px}.td{width:64px}.te{align-self:flex-end}.tf{color:rgba(255, 255, 255, 1)}.tg{fill:rgba(255, 255, 255, 1)}.th{background:rgba(25, 25, 25, 1)}.ti{border-color:rgba(25, 25, 25, 1)}.tl:disabled{opacity:0.1}.tm:disabled:hover{background:rgba(25, 25, 25, 1)}.tn:disabled:hover{border-color:rgba(25, 25, 25, 1)}.to{flex:1 1 auto}.tu{padding-right:4px}.tv{font-weight:500}.ui{margin-top:16px}.ur{gap:18px}.us{fill:rgba(61, 61, 61, 1)}.uu{fill:#242424}.uv{background:0}.uw{border-color:#242424}.ux:disabled:hover{color:#242424}.uy:disabled:hover{fill:#242424}.uz:disabled:hover{border-color:#242424}.vk{border-bottom:solid 1px #E5E5E5}.vl{margin-top:72px}.vm{padding:24px 
0}.vn{margin-bottom:0px}.vo{margin-right:16px}.as:hover:not(:disabled){color:rgba(25, 25, 25, 1)}.at:hover:not(:disabled){fill:rgba(25, 25, 25, 1)}.et:hover{background:rgba(90, 118, 144, 1)}.eu:hover{border-color:rgba(90, 118, 144, 1)}.ev:hover{cursor:pointer}.fl:hover{color:#242424}.fm:hover{fill:#242424}.fq:hover svg{fill:#242424}.ft:hover{background-color:rgba(0, 0, 0, 0.1)}.jc:hover{text-decoration:underline}.jh:hover:not(:disabled){color:rgba(90, 118, 144, 1)}.ji:hover:not(:disabled){fill:rgba(90, 118, 144, 1)}.lp:hover{fill:rgba(8, 8, 8, 1)}.mg:hover{fill:#000000}.mh:hover p{color:#000000}.mj:hover{color:#000000}.mt:hover svg{color:#000000}.rr:hover{background-color:#F2F2F2}.tb:hover{background-color:none}.tj:hover{background:#000000}.tk:hover{border-color:#242424}.ut:hover{fill:rgba(25, 25, 25, 1)}.bd:focus-within path{fill:#242424}.lo:focus{fill:rgba(8, 8, 8, 1)}.mu:focus svg{color:#000000}.nv:focus{transform:scale(1.01)}.lt:active{border-style:none}</style><style type="text/css" data-fela-rehydration="586" data-fela-type="RULE" media="all and (min-width: 1080px)">.d{display:none}.bw{width:64px}.cg{margin:0 64px}.cv{height:48px}.dc{margin-bottom:52px}.do{margin-bottom:48px}.ef{font-size:14px}.eg{line-height:20px}.em{font-size:13px}.eo{padding:5px 12px}.fh{display:flex}.fy{margin-bottom:68px}.gc{max-width:680px}.hm{font-size:42px}.hn{margin-top:1.19em}.ho{line-height:52px}.hp{letter-spacing:-0.011em}.ie{font-size:22px}.if{margin-top:0.92em}.ig{line-height:28px}.il{align-items:center}.kn{border-top:solid 1px #F2F2F2}.ko{border-bottom:solid 1px #F2F2F2}.kp{margin:32px 0 0}.kq{padding:3px 8px}.kz> *{margin-right:24px}.la> 
:last-child{margin-right:0}.ma{margin-top:0px}.mp{margin:0}.nq{margin-top:56px}.os{font-size:20px}.ot{margin-top:2.14em}.ou{line-height:32px}.ov{letter-spacing:-0.003em}.py{font-size:24px}.pz{margin-top:1.25em}.qa{line-height:30px}.qb{letter-spacing:-0.016em}.qh{margin-top:0.94em}.se{display:inline-block}.sj{margin-bottom:104px}.sn{flex-direction:row}.sq{margin-bottom:0}.sr{margin-right:20px}.tp{max-width:500px}.ug{line-height:24px}.uh{letter-spacing:0}.un{margin-bottom:88px}.uq{margin-bottom:72px}.ve{width:min-width}.vj{padding-top:72px}</style><style type="text/css" data-fela-rehydration="586" data-fela-type="RULE" media="all and (max-width: 1079.98px)">.e{display:none}.lz{margin-top:0px}.nz{margin-left:auto}.oa{text-align:center}.sd{display:inline-block}</style><style type="text/css" data-fela-rehydration="586" data-fela-type="RULE" media="all and (max-width: 903.98px)">.f{display:none}.ly{margin-top:0px}.sc{display:inline-block}</style><style type="text/css" data-fela-rehydration="586" data-fela-type="RULE" media="all and (max-width: 727.98px)">.g{display:none}.lw{margin-top:0px}.lx{margin-right:0px}.sb{display:inline-block}</style><style type="text/css" data-fela-rehydration="586" data-fela-type="RULE" media="all and (max-width: 551.98px)">.h{display:none}.s{display:flex}.t{justify-content:space-between}.bs{width:24px}.cc{margin:0 24px}.cr{height:40px}.cy{margin-bottom:44px}.dk{margin-bottom:32px}.dx{font-size:13px}.dy{line-height:20px}.eh{padding:0px 8px 1px}.fu{margin-bottom:4px}.gw{font-size:32px}.gx{margin-top:1.01em}.gy{line-height:38px}.gz{letter-spacing:-0.014em}.hs{font-size:18px}.ht{margin-top:0.79em}.hu{line-height:24px}.ih{align-items:flex-start}.jk{flex-direction:column}.jn{margin-bottom:2px}.kb{margin:24px -24px 0}.kc{padding:0}.kr> *{margin-right:8px}.ks> :last-child{margin-right:24px}.lj{margin-left:0px}.lu{margin-top:0px}.lv{margin-right:0px}.ml{margin:0}.mv{border:1px solid #F2F2F2}.mw{border-radius:99em}.mx{padding:0px 16px 0px 
12px}.my{height:38px}.mz{align-items:center}.nb svg{margin-right:8px}.nm{margin-top:40px}.oe{margin-top:1.56em}.of{line-height:28px}.og{letter-spacing:-0.003em}.pk{font-size:20px}.pl{margin-top:0.93em}.pm{letter-spacing:0}.qd{margin-top:0.67em}.sa{display:inline-block}.sf{margin-bottom:96px}.sy{margin-bottom:20px}.sz{margin-right:0}.tt{max-width:100%}.tw{font-size:24px}.tx{line-height:30px}.ty{letter-spacing:-0.016em}.uj{margin-bottom:64px}.va{width:100%}.vf{padding-top:48px}.na:hover{border-color:#E5E5E5}</style><style type="text/css" data-fela-rehydration="586" data-fela-type="RULE" media="all and (min-width: 904px) and (max-width: 1079.98px)">.i{display:none}.bv{width:64px}.cf{margin:0 64px}.cu{height:48px}.db{margin-bottom:52px}.dn{margin-bottom:48px}.ed{font-size:14px}.ee{line-height:20px}.ek{font-size:13px}.el{padding:5px 12px}.fg{display:flex}.fx{margin-bottom:68px}.gb{max-width:680px}.hi{font-size:42px}.hj{margin-top:1.19em}.hk{line-height:52px}.hl{letter-spacing:-0.011em}.ib{font-size:22px}.ic{margin-top:0.92em}.id{line-height:28px}.ik{align-items:center}.kj{border-top:solid 1px #F2F2F2}.kk{border-bottom:solid 1px #F2F2F2}.kl{margin:32px 0 0}.km{padding:3px 8px}.kx> *{margin-right:24px}.ky> :last-child{margin-right:0}.mo{margin:0}.np{margin-top:56px}.oo{font-size:20px}.op{margin-top:2.14em}.oq{line-height:32px}.or{letter-spacing:-0.003em}.pu{font-size:24px}.pv{margin-top:1.25em}.pw{line-height:30px}.px{letter-spacing:-0.016em}.qg{margin-top:0.94em}.si{margin-bottom:104px}.so{flex-direction:row}.ss{margin-bottom:0}.st{margin-right:20px}.tq{max-width:500px}.ue{line-height:24px}.uf{letter-spacing:0}.um{margin-bottom:88px}.up{margin-bottom:72px}.vd{width:min-width}.vi{padding-top:72px}</style><style type="text/css" data-fela-rehydration="586" data-fela-type="RULE" media="all and (min-width: 728px) and (max-width: 903.98px)">.j{display:none}.w{display:flex}.x{justify-content:space-between}.bu{width:64px}.ce{margin:0 
48px}.ct{height:48px}.da{margin-bottom:52px}.dm{margin-bottom:48px}.eb{font-size:13px}.ec{line-height:20px}.ej{padding:0px 8px 1px}.fw{margin-bottom:68px}.ga{max-width:680px}.he{font-size:42px}.hf{margin-top:1.19em}.hg{line-height:52px}.hh{letter-spacing:-0.011em}.hy{font-size:22px}.hz{margin-top:0.92em}.ia{line-height:28px}.ij{align-items:center}.kf{border-top:solid 1px #F2F2F2}.kg{border-bottom:solid 1px #F2F2F2}.kh{margin:32px 0 0}.ki{padding:3px 8px}.kv> *{margin-right:24px}.kw> :last-child{margin-right:0}.mn{margin:0}.no{margin-top:56px}.ok{font-size:20px}.ol{margin-top:2.14em}.om{line-height:32px}.on{letter-spacing:-0.003em}.pq{font-size:24px}.pr{margin-top:1.25em}.ps{line-height:30px}.pt{letter-spacing:-0.016em}.qf{margin-top:0.94em}.sh{margin-bottom:104px}.sp{flex-direction:row}.su{margin-bottom:0}.sv{margin-right:20px}.tr{max-width:500px}.uc{line-height:24px}.ud{letter-spacing:0}.ul{margin-bottom:88px}.uo{margin-bottom:72px}.vc{width:min-width}.vh{padding-top:72px}</style><style type="text/css" data-fela-rehydration="586" data-fela-type="RULE" media="all and (min-width: 552px) and (max-width: 727.98px)">.k{display:none}.u{display:flex}.v{justify-content:space-between}.bt{width:24px}.cd{margin:0 24px}.cs{height:40px}.cz{margin-bottom:44px}.dl{margin-bottom:32px}.dz{font-size:13px}.ea{line-height:20px}.ei{padding:0px 8px 1px}.fv{margin-bottom:4px}.ha{font-size:32px}.hb{margin-top:1.01em}.hc{line-height:38px}.hd{letter-spacing:-0.014em}.hv{font-size:18px}.hw{margin-top:0.79em}.hx{line-height:24px}.ii{align-items:flex-start}.jl{flex-direction:column}.jo{margin-bottom:2px}.kd{margin:24px 0 0}.ke{padding:0}.kt> *{margin-right:8px}.ku> :last-child{margin-right:8px}.lk{margin-left:0px}.mm{margin:0}.nc{border:1px solid #F2F2F2}.nd{border-radius:99em}.ne{padding:0px 16px 0px 12px}.nf{height:38px}.ng{align-items:center}.ni 
svg{margin-right:8px}.nn{margin-top:40px}.oh{margin-top:1.56em}.oi{line-height:28px}.oj{letter-spacing:-0.003em}.pn{font-size:20px}.po{margin-top:0.93em}.pp{letter-spacing:0}.qe{margin-top:0.67em}.sg{margin-bottom:96px}.sw{margin-bottom:20px}.sx{margin-right:0}.ts{max-width:100%}.tz{font-size:24px}.ua{line-height:30px}.ub{letter-spacing:-0.016em}.uk{margin-bottom:64px}.vb{width:100%}.vg{padding-top:48px}.nh:hover{border-color:#E5E5E5}</style><style type="text/css" data-fela-rehydration="586" data-fela-type="RULE" media="print">.ry{display:none}</style><style type="text/css" data-fela-rehydration="586" data-fela-type="RULE" media="(orientation: landscape) and (max-width: 903.98px)">.jy{max-height:none}</style><style type="text/css" data-fela-rehydration="586" data-fela-type="RULE" media="(prefers-reduced-motion: no-preference)">.ns{transition:transform 300ms cubic-bezier(0.2, 0, 0.2, 1)}</style></head><body><div id="root"><div class="a b c"><div class="d e f g h i j k"></div><script>document.domain = document.domain;</script><div class="l c"><div class="l m n o c"><div class="p q r s t u v w x i d y z"><a class="du ag dv bf ak b am an ao ap aq ar as at s u w i d q dw z" href="https://rsci.app.link/?%24canonical_url=https%3A%2F%2Fmedium.com%2Fp%2F15ecf6b0b&%7Efeature=LoOpenInAppButton&%7Echannel=ShowPostUnderCollection&source=---top_nav_layout_nav----------------------------------" rel="noopener follow">Open in app<svg xmlns="http://www.w3.org/2000/svg" width="10" height="10" fill="none" viewBox="0 0 10 10" class="dt"><path fill="currentColor" d="M.985 8.485a.375.375 0 1 0 .53.53zM8.75 1.25h.375A.375.375 0 0 0 8.75.875zM8.375 6.5a.375.375 0 1 0 .75 0zM3.5.875a.375.375 0 1 0 0 .75zm-1.985 8.14 7.5-7.5-.53-.53-7.5 7.5zm6.86-7.765V6.5h.75V1.25zM3.5 1.625h5.25v-.75H3.5z"></path></svg></a><div class="ab q"><p class="bf b dx dy dz ea eb ec ed ee ef eg du"><span><a class="bf b dx dy eh dz ea ei eb ec ej ek ee el em eg eo ep eq er es et eu ev ew ex ey ez fa fb fc fd bm fe 
ff" data-testid="headerSignUpButton" href="https://medium.com/m/signin?operation=register&redirect=https%3A%2F%2Ftowardsdatascience.com%2Fneural-machine-translation-15ecf6b0b&source=post_page---top_nav_layout_nav-----------------------global_nav-----------" rel="noopener follow">Sign up</a></span></p><div class="ax l"><p class="bf b dx dy dz ea eb ec ed ee ef eg du"><span><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="headerSignInButton" href="https://medium.com/m/signin?operation=login&redirect=https%3A%2F%2Ftowardsdatascience.com%2Fneural-machine-translation-15ecf6b0b&source=post_page---top_nav_layout_nav-----------------------global_nav-----------" rel="noopener follow">Sign in</a></span></p></div></div></div><div class="p q r ab ac"><div class="ab q ae"><a class="af ag ah ai aj ak al am an ao ap aq ar as at ab" aria-label="Homepage" data-testid="headerMediumLogo" href="https://medium.com/?source=---top_nav_layout_nav----------------------------------" rel="noopener follow"><svg xmlns="http://www.w3.org/2000/svg" width="719" height="160" fill="none" viewBox="0 0 719 160" class="au av aw"><path fill="#242424" d="m174.104 9.734.215-.047V8.02H130.39L89.6 103.89 48.81 8.021H1.472v1.666l.212.047c8.018 1.81 12.09 4.509 12.09 14.242V137.93c0 9.734-4.087 12.433-12.106 14.243l-.212.047v1.671h32.118v-1.665l-.213-.048c-8.018-1.809-12.089-4.509-12.089-14.242V30.586l52.399 123.305h2.972l53.925-126.743V140.75c-.687 7.688-4.721 10.062-11.982 11.701l-.215.05v1.652h55.948v-1.652l-.215-.05c-7.269-1.639-11.4-4.013-12.087-11.701l-.037-116.774h.037c0-9.733 4.071-12.432 12.087-14.242m25.555 75.488c.915-20.474 8.268-35.252 20.606-35.507 3.806.063 6.998 1.312 9.479 3.714 5.272 5.118 7.751 15.812 7.368 31.793zm-.553 5.77h65.573v-.275c-.186-15.656-4.721-27.834-13.466-36.196-7.559-7.227-18.751-11.203-30.507-11.203h-.263c-6.101 0-13.584 1.48-18.909 4.16-6.061 2.807-11.407 7.003-15.855 12.511-7.161 8.874-11.499 20.866-12.554 34.343q-.05.606-.092 1.212a50 50 0 0 0-.065 
1.151 85.807 85.807 0 0 0-.094 5.689c.71 30.524 17.198 54.917 46.483 54.917 25.705 0 40.675-18.791 44.407-44.013l-1.886-.664c-6.557 13.556-18.334 21.771-31.738 20.769-18.297-1.369-32.314-19.922-31.042-42.395m139.722 41.359c-2.151 5.101-6.639 7.908-12.653 7.908s-11.513-4.129-15.418-11.63c-4.197-8.053-6.405-19.436-6.405-32.92 0-28.067 8.729-46.22 22.24-46.22 5.657 0 10.111 2.807 12.236 7.704zm43.499 20.008c-8.019-1.897-12.089-4.722-12.089-14.951V1.309l-48.716 14.353v1.757l.299-.024c6.72-.543 11.278.386 13.925 2.83 2.072 1.915 3.082 4.853 3.082 8.987v18.66c-4.803-3.067-10.516-4.56-17.448-4.56-14.059 0-26.909 5.92-36.176 16.672-9.66 11.205-14.767 26.518-14.767 44.278-.003 31.72 15.612 53.039 38.851 53.039 13.595 0 24.533-7.449 29.54-20.013v16.865h43.711v-1.746zM424.1 19.819c0-9.904-7.468-17.374-17.375-17.374-9.859 0-17.573 7.632-17.573 17.374s7.721 17.374 17.573 17.374c9.907 0 17.375-7.47 17.375-17.374m11.499 132.546c-8.019-1.897-12.089-4.722-12.089-14.951h-.035V43.635l-43.714 12.551v1.705l.263.024c9.458.842 12.047 4.1 12.047 15.152v81.086h43.751v-1.746zm112.013 0c-8.018-1.897-12.089-4.722-12.089-14.951V43.635l-41.621 12.137v1.71l.246.026c7.733.813 9.967 4.257 9.967 15.36v59.279c-2.578 5.102-7.415 8.131-13.274 8.336-9.503 0-14.736-6.419-14.736-18.073V43.638l-43.714 12.55v1.703l.262.024c9.459.84 12.05 4.097 12.05 15.152v50.17a56.3 56.3 0 0 0 .91 10.444l.787 3.423c3.701 13.262 13.398 20.197 28.59 20.197 12.868 0 24.147-7.966 29.115-20.43v17.311h43.714v-1.747zm169.818 1.788v-1.749l-.213-.05c-8.7-2.006-12.089-5.789-12.089-13.49v-63.79c0-19.89-11.171-31.761-29.883-31.761-13.64 0-25.141 7.882-29.569 20.16-3.517-13.01-13.639-20.16-28.606-20.16-13.146 0-23.449 6.938-27.869 18.657V43.643L545.487 55.68v1.715l.263.024c9.345.829 12.047 4.181 12.047 14.95v81.784h40.787v-1.746l-.215-.053c-6.941-1.631-9.181-4.606-9.181-12.239V66.998c1.836-4.289 5.537-9.37 12.853-9.37 9.086 0 13.692 6.296 13.692 18.697v77.828h40.797v-1.746l-.215-.053c-6.94-1.631-9.18-4.606-9.18-12.239V75.066a42 42 0 0 
0-.578-7.26c1.947-4.661 5.86-10.177 13.475-10.177 9.214 0 13.691 6.114 13.691 18.696v77.828z"></path></svg></a><div class="ax h"><div class="ab ay az ba bb q bc bd"><div class="bm" aria-hidden="false" aria-describedby="searchResults" aria-labelledby="searchResults"></div><div class="bn bo ab"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" fill="none" viewBox="0 0 24 24"><path fill="currentColor" fill-rule="evenodd" d="M4.092 11.06a6.95 6.95 0 1 1 13.9 0 6.95 6.95 0 0 1-13.9 0m6.95-8.05a8.05 8.05 0 1 0 5.13 14.26l3.75 3.75a.56.56 0 1 0 .79-.79l-3.73-3.73A8.05 8.05 0 0 0 11.042 3z" clip-rule="evenodd"></path></svg></div><input role="combobox" aria-controls="searchResults" aria-expanded="false" aria-label="search" data-testid="headerSearchInput" tabindex="0" class="ay be bf bg z bh bi bj bk bl" placeholder="Search" value=""/></div></div></div><div class="h k w fg fh"><div class="fi ab"><span><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="headerWriteButton" href="https://medium.com/m/signin?operation=register&redirect=https%3A%2F%2Fmedium.com%2Fnew-story&source=---top_nav_layout_nav-----------------------new_post_topnav-----------" rel="noopener follow"><div class="bf b bg z du fj fk ab q fl fm"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" fill="none" viewBox="0 0 24 24" aria-label="Write"><path fill="currentColor" d="M14 4a.5.5 0 0 0 0-1zm7 6a.5.5 0 0 0-1 0zm-7-7H4v1h10zM3 4v16h1V4zm1 17h16v-1H4zm17-1V10h-1v10zm-1 1a1 1 0 0 0 1-1h-1zM3 20a1 1 0 0 0 1 1v-1zM4 3a1 1 0 0 0-1 1h1z"></path><path stroke="currentColor" d="m17.5 4.5-8.458 8.458a.25.25 0 0 0-.06.098l-.824 2.47a.25.25 0 0 0 .316.316l2.47-.823a.25.25 0 0 0 .098-.06L19.5 6.5m-2-2 2.323-2.323a.25.25 0 0 1 .354 0l1.646 1.646a.25.25 0 0 1 0 .354L19.5 6.5m-2-2 2 2"></path></svg><div class="dt l">Write</div></div></a></span></div></div><div class="k j i d"><div class="fi ab"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="headerSearchButton" 
href="https://medium.com/search?source=---top_nav_layout_nav----------------------------------" rel="noopener follow"><div class="bf b bg z du fj fk ab q fl fm"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" fill="none" viewBox="0 0 24 24" aria-label="Search"><path fill="currentColor" fill-rule="evenodd" d="M4.092 11.06a6.95 6.95 0 1 1 13.9 0 6.95 6.95 0 0 1-13.9 0m6.95-8.05a8.05 8.05 0 1 0 5.13 14.26l3.75 3.75a.56.56 0 1 0 .79-.79l-3.73-3.73A8.05 8.05 0 0 0 11.042 3z" clip-rule="evenodd"></path></svg></div></a></div></div><div class="fi h k j"><div class="ab q"><p class="bf b dx dy dz ea eb ec ed ee ef eg du"><span><a class="bf b dx dy eh dz ea ei eb ec ej ek ee el em eg eo ep eq er es et eu ev ew ex ey ez fa fb fc fd bm fe ff" data-testid="headerSignUpButton" href="https://medium.com/m/signin?operation=register&redirect=https%3A%2F%2Ftowardsdatascience.com%2Fneural-machine-translation-15ecf6b0b&source=post_page---top_nav_layout_nav-----------------------global_nav-----------" rel="noopener follow">Sign up</a></span></p><div class="ax l"><p class="bf b dx dy dz ea eb ec ed ee ef eg du"><span><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="headerSignInButton" href="https://medium.com/m/signin?operation=login&redirect=https%3A%2F%2Ftowardsdatascience.com%2Fneural-machine-translation-15ecf6b0b&source=post_page---top_nav_layout_nav-----------------------global_nav-----------" rel="noopener follow">Sign in</a></span></p></div></div></div><div class="l" aria-hidden="false"><button class="ay fn am ab q ao fo fp fq" aria-label="user options menu" data-testid="headerUserIcon"><div class="l fj"><img alt="" class="l fd by bz ca cx" src="https://miro.medium.com/v2/resize:fill:64:64/1*dmbNkD5D-u45r44go_cf0g.png" width="32" height="32" loading="lazy" role="presentation"/><div class="fr by l bz ca fs n ay ft"></div></div></button></div></div></div><div class="l"><div class="fu fv fw fx fy l"><div class="ab cb"><div class="ci bh fz ga gb 
gc"></div></div><article><div class="l"><div class="l"><span class="l"></span><section><div><div class="fs gi gj gk gl gm"></div><div class="gn go gp gq gr"><div class="ab cb"><div class="ci bh fz ga gb gc"><div><h1 id="4d3b" class="pw-post-title gs gt gu bf gv gw gx gy gz ha hb hc hd he hf hg hh hi hj hk hl hm hn ho hp hq bk" data-testid="storyTitle">Neural Machine Translation</h1></div><div><h2 id="c415" class="pw-subtitle-paragraph hr gt gu bf b hs ht hu hv hw hx hy hz ia ib ic id ie if ig cq du">A guide to Neural Machine Translation using an Encoder Decoder structure with attention. Includes a detailed tutorial using PyTorch in Google Colaboratory.</h2><div><div class="speechify-ignore ab cp"><div class="speechify-ignore bh l"><div class="ih ii ij ik il ab"><div><div class="ab im"><div><div class="bm" aria-hidden="false"><a href="https://medium.com/@lannersq?source=post_page---byline--15ecf6b0b--------------------------------" rel="noopener follow"><div class="l in io by ip iq"><div class="l fj"><img alt="Quinn Lanners" class="l fd by dd de cx" src="https://miro.medium.com/v2/resize:fill:88:88/2*Brk5nEh8iz86Uf-730hMgA.png" width="44" height="44" loading="lazy" data-testid="authorPhoto"/><div class="ir by l dd de fs n is ft"></div></div></div></a></div></div><div class="it ab fj"><div><div class="bm" aria-hidden="false"><a href="https://towardsdatascience.com/?source=post_page---byline--15ecf6b0b--------------------------------" rel="noopener follow"><div class="l iu iv by ip iw"><div class="l fj"><img alt="Towards Data Science" class="l fd by br ix cx" src="https://miro.medium.com/v2/resize:fill:48:48/1*CJe3891yB1A1mzMdqemkdg.jpeg" width="24" height="24" loading="lazy" data-testid="publicationPhoto"/><div class="ir by l br ix fs n is ft"></div></div></div></a></div></div></div></div></div><div class="bn bh l"><div class="ab"><div style="flex:1"><span class="bf b bg z bk"><div class="iy ab q"><div class="ab q iz"><div class="ab q"><div><div class="bm" 
aria-hidden="false"><p class="bf b ja jb bk"><a class="af ag ah ai aj ak al am an ao ap aq ar jc" data-testid="authorName" href="https://medium.com/@lannersq?source=post_page---byline--15ecf6b0b--------------------------------" rel="noopener follow">Quinn Lanners</a></p></div></div></div><span class="jd je" aria-hidden="true"><span class="bf b bg z du">·</span></span><p class="bf b ja jb du"><span><a class="jf jg ah ai aj ak al am an ao ap aq ar ex jh ji" href="https://medium.com/m/signin?actionUrl=https%3A%2F%2Fmedium.com%2F_%2Fsubscribe%2Fuser%2F8a2f7df48b90&operation=register&redirect=https%3A%2F%2Ftowardsdatascience.com%2Fneural-machine-translation-15ecf6b0b&user=Quinn+Lanners&userId=8a2f7df48b90&source=post_page-8a2f7df48b90--byline--15ecf6b0b---------------------post_header-----------" rel="noopener follow">Follow</a></span></p></div></div></span></div></div><div class="l jj"><span class="bf b bg z du"><div class="ab cn jk jl jm"><div class="jn jo ab"><div class="bf b bg z du ab jp"><span class="jq l jj">Published in</span><div><div class="l" aria-hidden="false"><a class="af ag ah ai aj ak al am an ao ap aq ar jc ab q" data-testid="publicationName" href="https://towardsdatascience.com/?source=post_page---byline--15ecf6b0b--------------------------------" rel="noopener follow"><p class="bf b bg z jr js jt ju jv jw jx jy bk">Towards Data Science</p></a></div></div></div><div class="h k"><span class="jd je" aria-hidden="true"><span class="bf b bg z du">·</span></span></div></div><span class="bf b bg z du"><div class="ab ae"><span data-testid="storyReadTime">21 min read</span><div class="jz ka l" aria-hidden="true"><span class="l" aria-hidden="true"><span class="bf b bg z du">·</span></span></div><span data-testid="storyPublishDate">Jun 3, 2019</span></div></span></div></span></div></div></div><div class="ab cp kb kc kd ke kf kg kh ki kj kk kl km kn ko kp kq"><div class="h k w fg fh q"><div class="lg l"><div class="ab q lh li"><div class="pw-multi-vote-icon fj jq 
lj lk ll"><span><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="headerClapButton" href="https://medium.com/m/signin?actionUrl=https%3A%2F%2Fmedium.com%2F_%2Fvote%2Ftowards-data-science%2F15ecf6b0b&operation=register&redirect=https%3A%2F%2Ftowardsdatascience.com%2Fneural-machine-translation-15ecf6b0b&user=Quinn+Lanners&userId=8a2f7df48b90&source=---header_actions--15ecf6b0b---------------------clap_footer-----------" rel="noopener follow"><div><div class="bm" aria-hidden="false"><div class="lm ao ln lo lp lq am lr ls lt ll"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" aria-label="clap"><path fill-rule="evenodd" d="M11.37.828 12 3.282l.63-2.454zM13.916 3.953l1.523-2.112-1.184-.39zM8.589 1.84l1.522 2.112-.337-2.501zM18.523 18.92c-.86.86-1.75 1.246-2.62 1.33a6 6 0 0 0 .407-.372c2.388-2.389 2.86-4.951 1.399-7.623l-.912-1.603-.79-1.672c-.26-.56-.194-.98.203-1.288a.7.7 0 0 1 .546-.132c.283.046.546.231.728.5l2.363 4.157c.976 1.624 1.141 4.237-1.324 6.702m-10.999-.438L3.37 14.328a.828.828 0 0 1 .585-1.408.83.83 0 0 1 .585.242l2.158 2.157a.365.365 0 0 0 .516-.516l-2.157-2.158-1.449-1.449a.826.826 0 0 1 1.167-1.17l3.438 3.44a.363.363 0 0 0 .516 0 .364.364 0 0 0 0-.516L5.293 9.513l-.97-.97a.826.826 0 0 1 0-1.166.84.84 0 0 1 1.167 0l.97.968 3.437 3.436a.36.36 0 0 0 .517 0 .366.366 0 0 0 0-.516L6.977 7.83a.82.82 0 0 1-.241-.584.82.82 0 0 1 .824-.826c.219 0 .43.087.584.242l5.787 5.787a.366.366 0 0 0 .587-.415l-1.117-2.363c-.26-.56-.194-.98.204-1.289a.7.7 0 0 1 .546-.132c.283.046.545.232.727.501l2.193 3.86c1.302 2.38.883 4.59-1.277 6.75-1.156 1.156-2.602 1.627-4.19 1.367-1.418-.236-2.866-1.033-4.079-2.246M10.75 5.971l2.12 2.12c-.41.502-.465 1.17-.128 1.89l.22.465-3.523-3.523a.8.8 0 0 1-.097-.368c0-.22.086-.428.241-.584a.847.847 0 0 1 1.167 0m7.355 1.705c-.31-.461-.746-.758-1.23-.837a1.44 1.44 0 0 0-1.11.275c-.312.24-.505.543-.59.881a1.74 1.74 0 0 0-.906-.465 1.47 1.47 0 0 0-.82.106l-2.182-2.182a1.56 1.56 0 0 0-2.2 0 1.54 
1.54 0 0 0-.396.701 1.56 1.56 0 0 0-2.21-.01 1.55 1.55 0 0 0-.416.753c-.624-.624-1.649-.624-2.237-.037a1.557 1.557 0 0 0 0 2.2c-.239.1-.501.238-.715.453a1.56 1.56 0 0 0 0 2.2l.516.515a1.556 1.556 0 0 0-.753 2.615L7.01 19c1.32 1.319 2.909 2.189 4.475 2.449q.482.08.971.08c.85 0 1.653-.198 2.393-.579.231.033.46.054.686.054 1.266 0 2.457-.52 3.505-1.567 2.763-2.763 2.552-5.734 1.439-7.586z" clip-rule="evenodd"></path></svg></div></div></div></a></span></div><div class="pw-multi-vote-count l lu lv lw lx ly lz ma"><p class="bf b dv z du"><span class="mb">--</span></p></div></div></div><div><div class="bm" aria-hidden="false"><button class="ao lm me mf ab q fk mg mh" aria-label="responses"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" class="md"><path d="M18.006 16.803c1.533-1.456 2.234-3.325 2.234-5.321C20.24 7.357 16.709 4 12.191 4S4 7.357 4 11.482c0 4.126 3.674 7.482 8.191 7.482.817 0 1.622-.111 2.393-.327.231.2.48.391.744.559 1.06.693 2.203 1.044 3.399 1.044.224-.008.4-.112.486-.287a.49.49 0 0 0-.042-.518c-.495-.67-.845-1.364-1.04-2.057a4 4 0 0 1-.125-.598zm-3.122 1.055-.067-.223-.315.096a8 8 0 0 1-2.311.338c-4.023 0-7.292-2.955-7.292-6.587 0-3.633 3.269-6.588 7.292-6.588 4.014 0 7.112 2.958 7.112 6.593 0 1.794-.608 3.469-2.027 4.72l-.195.168v.255c0 .056 0 .151.016.295.025.231.081.478.154.733.154.558.398 1.117.722 1.659a5.3 5.3 0 0 1-2.165-.845c-.276-.176-.714-.383-.941-.59z"></path></svg><p class="bf b dv z du"><span class="pw-responses-count mc md">5</span></p></button></div></div></div><div class="ab q kr ks kt ku kv kw kx ky kz la lb lc ld le lf"><div class="mi k j i d"></div><div class="h k"><div><div class="bm" aria-hidden="false"><span><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="headerBookmarkButton" 
href="https://medium.com/m/signin?actionUrl=https%3A%2F%2Fmedium.com%2F_%2Fbookmark%2Fp%2F15ecf6b0b&operation=register&redirect=https%3A%2F%2Ftowardsdatascience.com%2Fneural-machine-translation-15ecf6b0b&source=---header_actions--15ecf6b0b---------------------bookmark_footer-----------" rel="noopener follow"><svg xmlns="http://www.w3.org/2000/svg" width="25" height="25" fill="none" viewBox="0 0 25 25" class="du mj" aria-label="Add to list bookmark button"><path fill="currentColor" d="M18 2.5a.5.5 0 0 1 1 0V5h2.5a.5.5 0 0 1 0 1H19v2.5a.5.5 0 1 1-1 0V6h-2.5a.5.5 0 0 1 0-1H18zM7 7a1 1 0 0 1 1-1h3.5a.5.5 0 0 0 0-1H8a2 2 0 0 0-2 2v14a.5.5 0 0 0 .805.396L12.5 17l5.695 4.396A.5.5 0 0 0 19 21v-8.5a.5.5 0 0 0-1 0v7.485l-5.195-4.012a.5.5 0 0 0-.61 0L7 19.985z"></path></svg></a></span></div></div></div><div class="fd mk cn"><div class="l ae"><div class="ab cb"><div class="ml mm mn mo mp mq ci bh"><div class="ab"><div class="bm bh" aria-hidden="false"><div><div class="bm" aria-hidden="false"><button aria-label="Listen" data-testid="audioPlayButton" class="af fk ah ai aj ak al mr an ao ap ex ms mt mh mu mv mw mx my s mz na nb nc nd ne nf u ng nh ni"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" fill="none" viewBox="0 0 24 24"><path fill="currentColor" fill-rule="evenodd" d="M3 12a9 9 0 1 1 18 0 9 9 0 0 1-18 0m9-10C6.477 2 2 6.477 2 12s4.477 10 10 10 10-4.477 10-10S17.523 2 12 2m3.376 10.416-4.599 3.066a.5.5 0 0 1-.777-.416V8.934a.5.5 0 0 1 .777-.416l4.599 3.066a.5.5 0 0 1 0 .832" clip-rule="evenodd"></path></svg><div class="j i d"><p class="bf b bg z du">Listen</p></div></button></div></div></div></div></div></div></div></div><div class="bm" aria-hidden="false" aria-describedby="postFooterSocialMenu" aria-labelledby="postFooterSocialMenu"><div><div class="bm" aria-hidden="false"><button aria-controls="postFooterSocialMenu" aria-expanded="false" aria-label="Share Post" data-testid="headerSocialShareButton" class="af fk ah ai aj ak al mr an ao ap ex ms mt mh mu 
mv mw mx my s mz na nb nc nd ne nf u ng nh ni"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" fill="none" viewBox="0 0 24 24"><path fill="currentColor" fill-rule="evenodd" d="M15.218 4.931a.4.4 0 0 1-.118.132l.012.006a.45.45 0 0 1-.292.074.5.5 0 0 1-.3-.13l-2.02-2.02v7.07c0 .28-.23.5-.5.5s-.5-.22-.5-.5v-7.04l-2 2a.45.45 0 0 1-.57.04h-.02a.4.4 0 0 1-.16-.3.4.4 0 0 1 .1-.32l2.8-2.8a.5.5 0 0 1 .7 0l2.8 2.79a.42.42 0 0 1 .068.498m-.106.138.008.004v-.01zM16 7.063h1.5a2 2 0 0 1 2 2v10a2 2 0 0 1-2 2h-11c-1.1 0-2-.9-2-2v-10a2 2 0 0 1 2-2H8a.5.5 0 0 1 .35.15.5.5 0 0 1 .15.35.5.5 0 0 1-.15.35.5.5 0 0 1-.35.15H6.4c-.5 0-.9.4-.9.9v10.2a.9.9 0 0 0 .9.9h11.2c.5 0 .9-.4.9-.9v-10.2c0-.5-.4-.9-.9-.9H16a.5.5 0 0 1 0-1" clip-rule="evenodd"></path></svg><div class="j i d"><p class="bf b bg z du">Share</p></div></button></div></div></div></div></div></div></div></div></div><figure class="nm nn no np nq nr nj nk paragraph-image"><div role="button" tabindex="0" class="ns nt fj nu bh nv"><div class="nj nk nl"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*H441VINdbjxItCdtgb-1Xw.jpeg 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*H441VINdbjxItCdtgb-1Xw.jpeg 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*H441VINdbjxItCdtgb-1Xw.jpeg 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*H441VINdbjxItCdtgb-1Xw.jpeg 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*H441VINdbjxItCdtgb-1Xw.jpeg 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*H441VINdbjxItCdtgb-1Xw.jpeg 1100w, https://miro.medium.com/v2/resize:fit:1400/format:webp/1*H441VINdbjxItCdtgb-1Xw.jpeg 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, 
(-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*H441VINdbjxItCdtgb-1Xw.jpeg 640w, https://miro.medium.com/v2/resize:fit:720/1*H441VINdbjxItCdtgb-1Xw.jpeg 720w, https://miro.medium.com/v2/resize:fit:750/1*H441VINdbjxItCdtgb-1Xw.jpeg 750w, https://miro.medium.com/v2/resize:fit:786/1*H441VINdbjxItCdtgb-1Xw.jpeg 786w, https://miro.medium.com/v2/resize:fit:828/1*H441VINdbjxItCdtgb-1Xw.jpeg 828w, https://miro.medium.com/v2/resize:fit:1100/1*H441VINdbjxItCdtgb-1Xw.jpeg 1100w, https://miro.medium.com/v2/resize:fit:1400/1*H441VINdbjxItCdtgb-1Xw.jpeg 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px"/><img alt="" class="bh mq nw c" width="700" height="427" loading="eager" role="presentation"/></picture></div></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Image from pixabay.com</figcaption></figure><p id="87ab" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">Machine Translation (MT) is a subfield of computational linguistics that is focused on translating text from one language to another. With the power of deep learning, Neural Machine Translation (NMT) has arisen as the most powerful algorithm to perform this task. 
While Google Translate is the leading industry example of NMT, tech companies all over the globe are going <a class="af ox" href="https://slator.com/technology/corporates-going-all-in-on-neural-machine-translation-research/" rel="noopener ugc nofollow" target="_blank">all in on NMT</a>. This state-of-the-art algorithm is an application of deep learning in which massive datasets of translated sentences are used to train a model capable of translating between any two languages. With the vast amount of research in recent years, there are several variations of NMT currently being investigated and deployed in the industry. One of the older and more established versions of NMT is the Encoder Decoder structure. This architecture is composed of two recurrent neural networks (RNNs) used together in tandem to create a translation model. And when coupled with the power of <a class="af ox" href="https://arxiv.org/pdf/1508.04025.pdf" rel="noopener ugc nofollow" target="_blank">attention mechanisms</a>, this architecture can achieve impressive results.</p><p id="394d" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">This post is broken into two distinct parts. The first section consists of a brief explanation of NMT and the Encoder Decoder structure. Following this, the latter part of this article provides a tutorial which will allow the chance for you to create one of these structures yourself. This code tutorial is based largely on the <a class="af ox" href="https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html" rel="noopener ugc nofollow" target="_blank">PyTorch tutorial on NMT</a> with a number of enhancements. 
Most notably, this code tutorial can be run on a GPU to receive significantly better results.</p><p id="c768" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">Before we begin, it is assumed that if you are reading this article you have at least a general knowledge of <a class="af ox" href="https://skymind.ai/wiki/neural-network" rel="noopener ugc nofollow" target="_blank">neural networks and deep learning</a>; particularly the ideas of forward-propagation, loss functions and back-propagation, and the importance of <a class="af ox" rel="noopener" target="_blank" href="/train-validation-and-test-sets-72cb40cba9e7">train and test sets</a>.</p><p id="9249" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk"><em class="oy">If you are interested in jumping straight to the code, you can find the complete Jupyter notebook (or Python script) of the Google Colab tutorial outlined in this article on my </em><a class="af ox" href="https://github.com/qlanners/nmt_tutorial" rel="noopener ugc nofollow" target="_blank"><em class="oy">GitHub page for this project</em></a><em class="oy">.</em></p></div></div></div><div class="ab cb oz pa pb pc" role="separator"><span class="pd by bm pe pf pg"></span><span class="pd by bm pe pf pg"></span><span class="pd by bm pe pf"></span></div><div class="gn go gp gq gr"><div class="ab cb"><div class="ci bh fz ga gb gc"><h1 id="5789" class="ph pi gu bf pj pk pl hu pm pn po hx pp pq pr ps pt pu pv pw px py pz qa qb qc bk">Brief Explanation of NMT and the Encoder Decoder Structure</h1><p id="9011" class="pw-post-body-paragraph ob oc gu od b hs qd of og hv qe oi oj ok qf om on oo qg oq or os qh ou ov ow gn bk">The ultimate goal of any NMT model is to take a sentence in one language as input and return that sentence translated into a different language as output. 
The figure below is a naive representation of a translation algorithm (such as Google Translate) tasked with translating from English to Spanish.</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div role="button" tabindex="0" class="ns nt fj nu bh nv"><div class="nj nk qi"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*qWNEL7xMGraPLS6d6hCqyA.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*qWNEL7xMGraPLS6d6hCqyA.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*qWNEL7xMGraPLS6d6hCqyA.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*qWNEL7xMGraPLS6d6hCqyA.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*qWNEL7xMGraPLS6d6hCqyA.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*qWNEL7xMGraPLS6d6hCqyA.png 1100w, https://miro.medium.com/v2/resize:fit:1400/format:webp/1*qWNEL7xMGraPLS6d6hCqyA.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*qWNEL7xMGraPLS6d6hCqyA.png 640w, https://miro.medium.com/v2/resize:fit:720/1*qWNEL7xMGraPLS6d6hCqyA.png 720w, https://miro.medium.com/v2/resize:fit:750/1*qWNEL7xMGraPLS6d6hCqyA.png 750w, https://miro.medium.com/v2/resize:fit:786/1*qWNEL7xMGraPLS6d6hCqyA.png 786w, https://miro.medium.com/v2/resize:fit:828/1*qWNEL7xMGraPLS6d6hCqyA.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*qWNEL7xMGraPLS6d6hCqyA.png 1100w, 
https://miro.medium.com/v2/resize:fit:1400/1*qWNEL7xMGraPLS6d6hCqyA.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px"/><img alt="" class="bh mq nw c" width="700" height="120" loading="eager" role="presentation"/></picture></div></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 1: Translation from English to Spanish of the English sentence “the cat likes to eat pizza”</figcaption></figure><p id="0533" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">Before diving into the Encoder Decoder structure that is oftentimes used as the algorithm in the above figure, we first must understand how we overcome a large hurdle in any machine translation task. Namely, we need a way to transform sentences into a data format that can be inputted into a machine learning model. In essence, we must somehow convert our textual data into a numeric form.</p><p id="69f7" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">To do this in machine translation, each word is transformed into a One Hot Encoding vector which can then be inputted into the model. A One Hot Encoding vector is simply a vector with a 0 at every index except for a 1 at a single index corresponding to that particular word. 
In this way, each word has a distinct One Hot Encoding vector and thus we can represent every word in our dataset with a numerical representation.</p><p id="8226" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">The first step towards creating these vectors is to assign an index to each unique word in the input language, and then repeat this process for the output language. In assigning a unique index to each unique word, we will be creating what is referred to as a Vocabulary for each language. Ideally, the Vocabulary for each language would simply contain every unique word in that language. However, given that any single language can have hundreds of thousands of words, the vocabulary is often trimmed to the N most common words in the dataset we are working with (where N is chosen arbitrarily, but often ranges from 1,000–100,000 depending on the dataset size).</p><p id="14aa" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">To understand how we can then use a Vocabulary to create One Hot Encoding vectors for every word in our dataset, consider a mini-Vocabulary containing just the words in Table 1 below.</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div class="nj nk qj"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*_Pp0bAv3nZPYHbPFlvO7Hg.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*_Pp0bAv3nZPYHbPFlvO7Hg.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*_Pp0bAv3nZPYHbPFlvO7Hg.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*_Pp0bAv3nZPYHbPFlvO7Hg.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*_Pp0bAv3nZPYHbPFlvO7Hg.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*_Pp0bAv3nZPYHbPFlvO7Hg.png 1100w, https://miro.medium.com/v2/resize:fit:326/format:webp/1*_Pp0bAv3nZPYHbPFlvO7Hg.png 326w" 
sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 163px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*_Pp0bAv3nZPYHbPFlvO7Hg.png 640w, https://miro.medium.com/v2/resize:fit:720/1*_Pp0bAv3nZPYHbPFlvO7Hg.png 720w, https://miro.medium.com/v2/resize:fit:750/1*_Pp0bAv3nZPYHbPFlvO7Hg.png 750w, https://miro.medium.com/v2/resize:fit:786/1*_Pp0bAv3nZPYHbPFlvO7Hg.png 786w, https://miro.medium.com/v2/resize:fit:828/1*_Pp0bAv3nZPYHbPFlvO7Hg.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*_Pp0bAv3nZPYHbPFlvO7Hg.png 1100w, https://miro.medium.com/v2/resize:fit:326/1*_Pp0bAv3nZPYHbPFlvO7Hg.png 326w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 163px"/><img alt="" class="bh mq nw c" width="163" height="356" loading="lazy" role="presentation"/></picture></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Table 1: Mini-vocabulary for the English language</figcaption></figure><p id="d89a" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">Given this table, we have assigned a unique index 0–12 to 
every word in our mini-Vocabulary. The &lt;SOS&gt; and &lt;EOS&gt; tokens in the table are added to every Vocabulary and stand for START OF SENTENCE and END OF SENTENCE respectively. They are used by the NMT model to help identify these crucial points in sentences.</p><p id="fd00" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">Now, let’s say we want to convert the words in the sentence “the blue whale ate the red fish” to their one hot encoding vectors. Using Table 1, we would do this as shown in Figure 2 below.</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div role="button" tabindex="0" class="ns nt fj nu bh nv"><div class="nj nk qk"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*nxHrAM5dwoqqFFldP0Wv6w.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*nxHrAM5dwoqqFFldP0Wv6w.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*nxHrAM5dwoqqFFldP0Wv6w.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*nxHrAM5dwoqqFFldP0Wv6w.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*nxHrAM5dwoqqFFldP0Wv6w.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*nxHrAM5dwoqqFFldP0Wv6w.png 1100w, https://miro.medium.com/v2/resize:fit:1400/format:webp/1*nxHrAM5dwoqqFFldP0Wv6w.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*nxHrAM5dwoqqFFldP0Wv6w.png 640w, 
https://miro.medium.com/v2/resize:fit:720/1*nxHrAM5dwoqqFFldP0Wv6w.png 720w, https://miro.medium.com/v2/resize:fit:750/1*nxHrAM5dwoqqFFldP0Wv6w.png 750w, https://miro.medium.com/v2/resize:fit:786/1*nxHrAM5dwoqqFFldP0Wv6w.png 786w, https://miro.medium.com/v2/resize:fit:828/1*nxHrAM5dwoqqFFldP0Wv6w.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*nxHrAM5dwoqqFFldP0Wv6w.png 1100w, https://miro.medium.com/v2/resize:fit:1400/1*nxHrAM5dwoqqFFldP0Wv6w.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px"/><img alt="" class="bh mq nw c" width="700" height="314" loading="lazy" role="presentation"/></picture></div></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 2: One Hot Encoding vectors for the sentence “the blue whale ate the red fish”</figcaption></figure><p id="066b" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">As you can see above, each word becomes a vector of length 13 (which is the size of our vocabulary) and consists entirely of 0s except for a 1 at the index that was assigned to that word in Table 1.</p><p id="7904" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">By creating a vocabulary for both the input and output languages, we can perform this technique on every sentence in each language to completely transform any corpus of translated sentences into a format suitable for the task of machine translation.</p><p id="b472" class="pw-post-body-paragraph ob oc gu od b 
hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">Now, with an understanding of how we can represent textual data in a numeric way, let’s look at the magic behind this Encoder Decoder algorithm. At the most basic level, the Encoder portion of the model takes a sentence in the input language and creates a <em class="oy">thought vector</em> from this sentence. This <em class="oy">thought vector</em> stores the meaning of the sentence and is subsequently passed to a Decoder which outputs the translation of the sentence in the output language. This process is shown in the figure below.</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div role="button" tabindex="0" class="ns nt fj nu bh nv"><div class="nj nk ql"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*KeD0mc9o9DQZ59-nO95sPw.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*KeD0mc9o9DQZ59-nO95sPw.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*KeD0mc9o9DQZ59-nO95sPw.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*KeD0mc9o9DQZ59-nO95sPw.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*KeD0mc9o9DQZ59-nO95sPw.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*KeD0mc9o9DQZ59-nO95sPw.png 1100w, https://miro.medium.com/v2/resize:fit:1400/format:webp/1*KeD0mc9o9DQZ59-nO95sPw.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px" type="image/webp"/><source data-testid="og" 
srcSet="https://miro.medium.com/v2/resize:fit:640/1*KeD0mc9o9DQZ59-nO95sPw.png 640w, https://miro.medium.com/v2/resize:fit:720/1*KeD0mc9o9DQZ59-nO95sPw.png 720w, https://miro.medium.com/v2/resize:fit:750/1*KeD0mc9o9DQZ59-nO95sPw.png 750w, https://miro.medium.com/v2/resize:fit:786/1*KeD0mc9o9DQZ59-nO95sPw.png 786w, https://miro.medium.com/v2/resize:fit:828/1*KeD0mc9o9DQZ59-nO95sPw.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*KeD0mc9o9DQZ59-nO95sPw.png 1100w, https://miro.medium.com/v2/resize:fit:1400/1*KeD0mc9o9DQZ59-nO95sPw.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px"/><img alt="" class="bh mq nw c" width="700" height="165" loading="lazy" role="presentation"/></picture></div></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 3: Encoder Decoder structure translating the English sentence “the cat likes to eat pizza” to the Spanish sentence “el gato le gusta comer pizza”</figcaption></figure><p id="cb9c" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">In the above architecture, the Encoder and the Decoder are both recurrent neural networks (RNN). In this particular tutorial, we will be using Long Short-Term Memory (LSTM) models, which are a type of RNN. However, other RNN architectures, such as a GRU, are often used. At a basic level, RNNs are neural networks designed specifically to deal with temporal/textual data. 
This article will give a high-level overview of how RNNs work in the context of NMT, however, I would strongly recommend looking further into these concepts if you are not already familiar with them. For a more thorough explanation of RNNs and LSTMs see <a class="af ox" href="https://colah.github.io/posts/2015-08-Understanding-LSTMs/" rel="noopener ugc nofollow" target="_blank">here</a>, and for a deeper article on LSTMs in the context of language translation, in particular, see <a class="af ox" href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.248.4448&rep=rep1&type=pdf" rel="noopener ugc nofollow" target="_blank">here</a>.</p><p id="cdf9" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">In the case of the Encoder, each word in the input sentence is fed separately into the model in a number of consecutive time-steps. At each time-step, <em class="oy">t</em>, the model updates a hidden vector, <em class="oy">h</em>,<em class="oy"> </em>using information from the word inputted to the model at that time-step<em class="oy">. </em>This hidden vector works to store information about the inputted sentence. In this way, since no words have yet been inputted to the Encoder at time-step <em class="oy">t=</em>0, the hidden state in the Encoder starts out as an empty vector at this time-step. 
We represent this hidden state with the blue box in Figure 4, where the subscript <em class="oy">t</em>=0 indicates the time-step and the superscript E corresponds to the fact that it’s a hidden state of the Encoder (rather than a D for the Decoder).</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div class="nj nk qm"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*GZCzxYAMHdiCLiTTHNBsSw.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*GZCzxYAMHdiCLiTTHNBsSw.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*GZCzxYAMHdiCLiTTHNBsSw.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*GZCzxYAMHdiCLiTTHNBsSw.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*GZCzxYAMHdiCLiTTHNBsSw.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*GZCzxYAMHdiCLiTTHNBsSw.png 1100w, https://miro.medium.com/v2/resize:fit:200/format:webp/1*GZCzxYAMHdiCLiTTHNBsSw.png 200w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 100px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*GZCzxYAMHdiCLiTTHNBsSw.png 640w, https://miro.medium.com/v2/resize:fit:720/1*GZCzxYAMHdiCLiTTHNBsSw.png 720w, https://miro.medium.com/v2/resize:fit:750/1*GZCzxYAMHdiCLiTTHNBsSw.png 750w, https://miro.medium.com/v2/resize:fit:786/1*GZCzxYAMHdiCLiTTHNBsSw.png 786w, https://miro.medium.com/v2/resize:fit:828/1*GZCzxYAMHdiCLiTTHNBsSw.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*GZCzxYAMHdiCLiTTHNBsSw.png 1100w, 
https://miro.medium.com/v2/resize:fit:200/1*GZCzxYAMHdiCLiTTHNBsSw.png 200w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 100px"/><img alt="" class="bh mq nw c" width="100" height="108" loading="lazy" role="presentation"/></picture></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 4: Encoder hidden vector at t=0</figcaption></figure><p id="0773" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">At each time-step, this hidden vector takes in information from the inputted word at that time-step, while preserving the information it has already stored from previous time-steps. Thus, at the final time-step, the meaning of the whole input sentence is stored in the hidden vector. This hidden vector at the final time-step is the <em class="oy">thought vector</em> referred to above, which is then inputted into the Decoder. 
The process of encoding the English sentence “the cat likes to eat pizza” is represented in Figure 5.</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div role="button" tabindex="0" class="ns nt fj nu bh nv"><div class="nj nk qn"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*xd8j4KoKRSzRq0b1Vx0FAA.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*xd8j4KoKRSzRq0b1Vx0FAA.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*xd8j4KoKRSzRq0b1Vx0FAA.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*xd8j4KoKRSzRq0b1Vx0FAA.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*xd8j4KoKRSzRq0b1Vx0FAA.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*xd8j4KoKRSzRq0b1Vx0FAA.png 1100w, https://miro.medium.com/v2/resize:fit:1400/format:webp/1*xd8j4KoKRSzRq0b1Vx0FAA.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*xd8j4KoKRSzRq0b1Vx0FAA.png 640w, https://miro.medium.com/v2/resize:fit:720/1*xd8j4KoKRSzRq0b1Vx0FAA.png 720w, https://miro.medium.com/v2/resize:fit:750/1*xd8j4KoKRSzRq0b1Vx0FAA.png 750w, https://miro.medium.com/v2/resize:fit:786/1*xd8j4KoKRSzRq0b1Vx0FAA.png 786w, https://miro.medium.com/v2/resize:fit:828/1*xd8j4KoKRSzRq0b1Vx0FAA.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*xd8j4KoKRSzRq0b1Vx0FAA.png 1100w, https://miro.medium.com/v2/resize:fit:1400/1*xd8j4KoKRSzRq0b1Vx0FAA.png 1400w" 
sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px"/><img alt="" class="bh mq nw c" width="700" height="145" loading="lazy" role="presentation"/></picture></div></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 5: Encoding of the sentence “the cat likes to eat pizza”</figcaption></figure><p id="b862" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">In the above figure, the blue arrows correspond to weight matrices, which we will work to enhance through training to achieve more accurate translations.</p><p id="a1b2" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">Also, notice how the final hidden state of the Encoder becomes the thought vector and is relabeled with superscript D at <em class="oy">t</em>=0. This is because this final hidden vector of the Encoder becomes the initial hidden vector of the Decoder. In this way, we are passing the encoded meaning of the sentence to the Decoder to be translated to a sentence in the output language. However, unlike the Encoder, we need the Decoder to output a translated sentence of variable length. 
Thus, we are going to have our Decoder output a prediction word at each time-step until we have outputted a complete sentence.</p><p id="d964" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">In order to start this translation, we are going to input a &lt;SOS&gt; tag as the input at the first time-step in the Decoder. Just as in the Encoder, the Decoder will use the &lt;SOS&gt; input at time-step <em class="oy">t</em>=1 to update its hidden state. However, rather than just proceeding to the next time-step, the Decoder will use an additional weight matrix to create a probability over all of the words in the output vocabulary. In this way, the word with the highest probability in the output vocabulary will become the first word in the predicted output sentence. This first step of the Decoder, translating from “the cat likes to eat pizza” to “el gato le gusta comer pizza”, is shown in Figure 6. For the sake of simplicity, the output vocabulary is restricted to the words in the output sentence (but in practice would consist of the thousands of words in the entire output vocabulary).</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div class="nj nk qo"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*0kJbrSpwyzneRE1hHUHB6g.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*0kJbrSpwyzneRE1hHUHB6g.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*0kJbrSpwyzneRE1hHUHB6g.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*0kJbrSpwyzneRE1hHUHB6g.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*0kJbrSpwyzneRE1hHUHB6g.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*0kJbrSpwyzneRE1hHUHB6g.png 1100w, https://miro.medium.com/v2/resize:fit:624/format:webp/1*0kJbrSpwyzneRE1hHUHB6g.png 624w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px)
50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 312px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*0kJbrSpwyzneRE1hHUHB6g.png 640w, https://miro.medium.com/v2/resize:fit:720/1*0kJbrSpwyzneRE1hHUHB6g.png 720w, https://miro.medium.com/v2/resize:fit:750/1*0kJbrSpwyzneRE1hHUHB6g.png 750w, https://miro.medium.com/v2/resize:fit:786/1*0kJbrSpwyzneRE1hHUHB6g.png 786w, https://miro.medium.com/v2/resize:fit:828/1*0kJbrSpwyzneRE1hHUHB6g.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*0kJbrSpwyzneRE1hHUHB6g.png 1100w, https://miro.medium.com/v2/resize:fit:624/1*0kJbrSpwyzneRE1hHUHB6g.png 624w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 312px"/><img alt="" class="bh mq nw c" width="312" height="484" loading="lazy" role="presentation"/></picture></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 6: First step of the Decoder</figcaption></figure><p id="b8f8" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">Now, given that the word “el” was given the highest probability, this word becomes the first word in our outputted prediction sentence. 
And we proceed by using “el” as the input in the next time-step as in Figure 7 below.</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div class="nj nk qp"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*fIOdNSYBADB4452cRXWcsA.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*fIOdNSYBADB4452cRXWcsA.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*fIOdNSYBADB4452cRXWcsA.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*fIOdNSYBADB4452cRXWcsA.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*fIOdNSYBADB4452cRXWcsA.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*fIOdNSYBADB4452cRXWcsA.png 1100w, https://miro.medium.com/v2/resize:fit:882/format:webp/1*fIOdNSYBADB4452cRXWcsA.png 882w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 441px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*fIOdNSYBADB4452cRXWcsA.png 640w, https://miro.medium.com/v2/resize:fit:720/1*fIOdNSYBADB4452cRXWcsA.png 720w, https://miro.medium.com/v2/resize:fit:750/1*fIOdNSYBADB4452cRXWcsA.png 750w, https://miro.medium.com/v2/resize:fit:786/1*fIOdNSYBADB4452cRXWcsA.png 786w, https://miro.medium.com/v2/resize:fit:828/1*fIOdNSYBADB4452cRXWcsA.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*fIOdNSYBADB4452cRXWcsA.png 1100w, https://miro.medium.com/v2/resize:fit:882/1*fIOdNSYBADB4452cRXWcsA.png 882w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) 
and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 441px"/><img alt="" class="bh mq nw c" width="441" height="510" loading="lazy" role="presentation"/></picture></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 7: Second step of the Decoder</figcaption></figure><p id="f8f4" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">We proceed in this way through the duration of the sentence — that is until we run into an error such as that depicted below in Figure 8.</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div role="button" tabindex="0" class="ns nt fj nu bh nv"><div class="nj nk qq"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*cMo4JhbtDkm1Wy1EnqhEkg.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*cMo4JhbtDkm1Wy1EnqhEkg.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*cMo4JhbtDkm1Wy1EnqhEkg.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*cMo4JhbtDkm1Wy1EnqhEkg.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*cMo4JhbtDkm1Wy1EnqhEkg.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*cMo4JhbtDkm1Wy1EnqhEkg.png 1100w, https://miro.medium.com/v2/resize:fit:1400/format:webp/1*cMo4JhbtDkm1Wy1EnqhEkg.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, 
(-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*cMo4JhbtDkm1Wy1EnqhEkg.png 640w, https://miro.medium.com/v2/resize:fit:720/1*cMo4JhbtDkm1Wy1EnqhEkg.png 720w, https://miro.medium.com/v2/resize:fit:750/1*cMo4JhbtDkm1Wy1EnqhEkg.png 750w, https://miro.medium.com/v2/resize:fit:786/1*cMo4JhbtDkm1Wy1EnqhEkg.png 786w, https://miro.medium.com/v2/resize:fit:828/1*cMo4JhbtDkm1Wy1EnqhEkg.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*cMo4JhbtDkm1Wy1EnqhEkg.png 1100w, https://miro.medium.com/v2/resize:fit:1400/1*cMo4JhbtDkm1Wy1EnqhEkg.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px"/><img alt="" class="bh mq nw c" width="700" height="476" loading="lazy" role="presentation"/></picture></div></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 8: Translation error in Decoder</figcaption></figure><p id="58e9" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">As you can see, the Decoder has predicted “pizza” to be the next word in the translated sentence, when it should actually be “comer”. When testing the model on the test set, we would do nothing to correct this error and would allow the Decoder to use this improper prediction as the input at the next time-step. 
However, during the training process, we are going to keep “pizza” as the predicted word at that point in the sentence, but force our Decoder to input the correct word “comer” as the input for the next time-step. This is a strategy referred to as <a class="af ox" href="https://machinelearningmastery.com/teacher-forcing-for-recurrent-neural-networks/" rel="noopener ugc nofollow" target="_blank">teacher-forcing</a> and helps speed up the training process. It is shown in the below figure.</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div role="button" tabindex="0" class="ns nt fj nu bh nv"><div class="nj nk qr"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*ignkCc7wFznUGDBKN-3ylg.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*ignkCc7wFznUGDBKN-3ylg.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*ignkCc7wFznUGDBKN-3ylg.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*ignkCc7wFznUGDBKN-3ylg.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*ignkCc7wFznUGDBKN-3ylg.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*ignkCc7wFznUGDBKN-3ylg.png 1100w, https://miro.medium.com/v2/resize:fit:1400/format:webp/1*ignkCc7wFznUGDBKN-3ylg.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*ignkCc7wFznUGDBKN-3ylg.png 640w, https://miro.medium.com/v2/resize:fit:720/1*ignkCc7wFznUGDBKN-3ylg.png 720w, 
https://miro.medium.com/v2/resize:fit:750/1*ignkCc7wFznUGDBKN-3ylg.png 750w, https://miro.medium.com/v2/resize:fit:786/1*ignkCc7wFznUGDBKN-3ylg.png 786w, https://miro.medium.com/v2/resize:fit:828/1*ignkCc7wFznUGDBKN-3ylg.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*ignkCc7wFznUGDBKN-3ylg.png 1100w, https://miro.medium.com/v2/resize:fit:1400/1*ignkCc7wFznUGDBKN-3ylg.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px"/><img alt="" class="bh mq nw c" width="700" height="362" loading="lazy" role="presentation"/></picture></div></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 9: Teacher-forcing</figcaption></figure><p id="6901" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">Now, since the Decoder has to output prediction sentences of variable lengths, the Decoder will continue predicting words in this fashion until it predicts the next word in the sentence to be an &lt;EOS&gt; tag. Once this tag has been predicted, the decoding process is complete and we are left with a complete predicted translation of the input sentence.
The entire process of decoding the thought vector for the input sentence “the cat likes to eat pizza” is shown in Figure 10.</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div role="button" tabindex="0" class="ns nt fj nu bh nv"><div class="nj nk qs"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*GwKpF9yMipPWuruXoTWKPQ.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*GwKpF9yMipPWuruXoTWKPQ.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*GwKpF9yMipPWuruXoTWKPQ.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*GwKpF9yMipPWuruXoTWKPQ.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*GwKpF9yMipPWuruXoTWKPQ.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*GwKpF9yMipPWuruXoTWKPQ.png 1100w, https://miro.medium.com/v2/resize:fit:1400/format:webp/1*GwKpF9yMipPWuruXoTWKPQ.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*GwKpF9yMipPWuruXoTWKPQ.png 640w, https://miro.medium.com/v2/resize:fit:720/1*GwKpF9yMipPWuruXoTWKPQ.png 720w, https://miro.medium.com/v2/resize:fit:750/1*GwKpF9yMipPWuruXoTWKPQ.png 750w, https://miro.medium.com/v2/resize:fit:786/1*GwKpF9yMipPWuruXoTWKPQ.png 786w, https://miro.medium.com/v2/resize:fit:828/1*GwKpF9yMipPWuruXoTWKPQ.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*GwKpF9yMipPWuruXoTWKPQ.png 1100w, https://miro.medium.com/v2/resize:fit:1400/1*GwKpF9yMipPWuruXoTWKPQ.png 
1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px"/><img alt="" class="bh mq nw c" width="700" height="251" loading="lazy" role="presentation"/></picture></div></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 10: Decoding of the sentence “the cat likes to eat pizza”</figcaption></figure><p id="fea2" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">We can then compare the accuracy of this predicted translation to the actual translation of the input sentence to compute a loss. While there are several varieties of loss functions, a very common one to utilize is the Cross-Entropy Loss. 
The equation of this loss function is detailed in Figure 11.</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div class="nj nk qt"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*JZ-qea3BYaGOT4Vdhds9mQ.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*JZ-qea3BYaGOT4Vdhds9mQ.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*JZ-qea3BYaGOT4Vdhds9mQ.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*JZ-qea3BYaGOT4Vdhds9mQ.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*JZ-qea3BYaGOT4Vdhds9mQ.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*JZ-qea3BYaGOT4Vdhds9mQ.png 1100w, https://miro.medium.com/v2/resize:fit:996/format:webp/1*JZ-qea3BYaGOT4Vdhds9mQ.png 996w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 498px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*JZ-qea3BYaGOT4Vdhds9mQ.png 640w, https://miro.medium.com/v2/resize:fit:720/1*JZ-qea3BYaGOT4Vdhds9mQ.png 720w, https://miro.medium.com/v2/resize:fit:750/1*JZ-qea3BYaGOT4Vdhds9mQ.png 750w, https://miro.medium.com/v2/resize:fit:786/1*JZ-qea3BYaGOT4Vdhds9mQ.png 786w, https://miro.medium.com/v2/resize:fit:828/1*JZ-qea3BYaGOT4Vdhds9mQ.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*JZ-qea3BYaGOT4Vdhds9mQ.png 1100w, https://miro.medium.com/v2/resize:fit:996/1*JZ-qea3BYaGOT4Vdhds9mQ.png 996w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 
50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 498px"/><img alt="" class="bh mq nw c" width="498" height="394" loading="lazy" role="presentation"/></picture></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 11: Cross-Entropy Loss function</figcaption></figure><p id="f487" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">In essence, what this loss function does is sum over the negative log likelihoods that the model gives to the correct word at each position in the output sentence. Given that the negative log function has a value of 0 when the input is 1 and increases without bound as the input approaches 0 (as shown in Figure 12), the closer the probability that the model gives to the correct word at each point in the sentence is to 100%, the lower the loss.</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div class="nj nk qu"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*GLLqWJkqlF4RcbuftbIIDA.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*GLLqWJkqlF4RcbuftbIIDA.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*GLLqWJkqlF4RcbuftbIIDA.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*GLLqWJkqlF4RcbuftbIIDA.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*GLLqWJkqlF4RcbuftbIIDA.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*GLLqWJkqlF4RcbuftbIIDA.png 1100w, https://miro.medium.com/v2/resize:fit:838/format:webp/1*GLLqWJkqlF4RcbuftbIIDA.png 838w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4)
and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 419px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*GLLqWJkqlF4RcbuftbIIDA.png 640w, https://miro.medium.com/v2/resize:fit:720/1*GLLqWJkqlF4RcbuftbIIDA.png 720w, https://miro.medium.com/v2/resize:fit:750/1*GLLqWJkqlF4RcbuftbIIDA.png 750w, https://miro.medium.com/v2/resize:fit:786/1*GLLqWJkqlF4RcbuftbIIDA.png 786w, https://miro.medium.com/v2/resize:fit:828/1*GLLqWJkqlF4RcbuftbIIDA.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*GLLqWJkqlF4RcbuftbIIDA.png 1100w, https://miro.medium.com/v2/resize:fit:838/1*GLLqWJkqlF4RcbuftbIIDA.png 838w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 419px"/><img alt="" class="bh mq nw c" width="419" height="528" loading="lazy" role="presentation"/></picture></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 12: Graph of the function y = -log(x)</figcaption></figure><p id="5c8f" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">For example, given that the correct first word in the output sentence above is “el”, and our model gave a fairly high probability to the word “el” at that 
position, the loss for this position would be fairly low. Conversely, since the correct word at time-step <em class="oy">t</em>=5 is “comer”, but our model gave a rather low probability to the word “comer”, the loss at that step would be relatively high.</p><p id="2bec" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">By summing over the loss for each word in the output sentence a total loss for the sentence is obtained. This loss corresponds to the accuracy of the translation, with lower loss values corresponding to better translations. When training, the loss values of several sentences in a batch would be summed together, resulting in a total batch loss. This batch loss would then be used to perform mini-batch gradient descent to update all of the weight matrices in both the Decoder and the Encoder. These updates modify the weight matrices to slightly enhance the accuracy of the model’s translations. Thus, by performing this process iteratively, we eventually construct weight matrices that are capable of creating quality translations.</p><p id="a897" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk"><em class="oy">If you are unfamiliar with the concept of batches and/or mini-batch gradient descent you can find a short explanation of these concepts </em><a class="af ox" href="https://machinelearningmastery.com/gentle-introduction-mini-batch-gradient-descent-configure-batch-size/" rel="noopener ugc nofollow" target="_blank"><em class="oy">here</em></a><em class="oy">.</em></p><p id="452a" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">As mentioned in the introduction, an attention mechanism is an incredible tool that greatly enhances an NMT model’s ability to create accurate translations. 
While there are a number of different types of attention mechanisms, some of which you can read about <a class="af ox" href="https://arxiv.org/pdf/1508.04025.pdf" rel="noopener ugc nofollow" target="_blank">here</a>, the model built in this tutorial uses a rather simple implementation of global attention. In this method of attention, at each time-step, the Decoder “looks back” at all of the hidden vectors of the Encoder to create a memory vector. It then uses this memory vector, along with the hidden vector in the Decoder at that time-step, to predict the next word in the translated sentence. In doing this, the Decoder utilizes valuable information from the Encoder that would otherwise go to waste. A visual representation of this process is shown in Figure 13. I’d recommend reading the linked article in this paragraph to learn more about the various ways this memory vector can be calculated to gain a better understanding of this important concept.</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div role="button" tabindex="0" class="ns nt fj nu bh nv"><div class="nj nk qv"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*ff0Uh3mefwAMH7Z0gNrqTQ.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*ff0Uh3mefwAMH7Z0gNrqTQ.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*ff0Uh3mefwAMH7Z0gNrqTQ.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*ff0Uh3mefwAMH7Z0gNrqTQ.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*ff0Uh3mefwAMH7Z0gNrqTQ.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*ff0Uh3mefwAMH7Z0gNrqTQ.png 1100w, https://miro.medium.com/v2/resize:fit:1400/format:webp/1*ff0Uh3mefwAMH7Z0gNrqTQ.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, 
(min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*ff0Uh3mefwAMH7Z0gNrqTQ.png 640w, https://miro.medium.com/v2/resize:fit:720/1*ff0Uh3mefwAMH7Z0gNrqTQ.png 720w, https://miro.medium.com/v2/resize:fit:750/1*ff0Uh3mefwAMH7Z0gNrqTQ.png 750w, https://miro.medium.com/v2/resize:fit:786/1*ff0Uh3mefwAMH7Z0gNrqTQ.png 786w, https://miro.medium.com/v2/resize:fit:828/1*ff0Uh3mefwAMH7Z0gNrqTQ.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*ff0Uh3mefwAMH7Z0gNrqTQ.png 1100w, https://miro.medium.com/v2/resize:fit:1400/1*ff0Uh3mefwAMH7Z0gNrqTQ.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px"/><img alt="" class="bh mq nw c" width="700" height="456" loading="lazy" role="presentation"/></picture></div></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 13: Attention mechanism for time-step t=1 in Decoder</figcaption></figure><p id="34c6" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk"><em class="oy">Note: Attention mechanisms are incredibly powerful and have recently been proposed (and shown) to be more effective when used on their own (i.e. without any RNN architecture). 
If you’re interested in NMT I’d recommend you look into transformers and particularly read the article “</em><a class="af ox" href="https://arxiv.org/pdf/1706.03762.pdf" rel="noopener ugc nofollow" target="_blank"><em class="oy">Attention Is All You Need</em></a><em class="oy">”.</em></p></div></div></div><div class="ab cb oz pa pb pc" role="separator"><span class="pd by bm pe pf pg"></span><span class="pd by bm pe pf pg"></span><span class="pd by bm pe pf"></span></div><div class="gn go gp gq gr"><div class="ab cb"><div class="ci bh fz ga gb gc"><h1 id="74d8" class="ph pi gu bf pj pk pl hu pm pn po hx pp pq pr ps pt pu pv pw px py pz qa qb qc bk"><strong class="al">Coding Tutorial (Python)</strong></h1><p id="e2b5" class="pw-post-body-paragraph ob oc gu od b hs qd of og hv qe oi oj ok qf om on oo qg oq or os qh ou ov ow gn bk">Before beginning the tutorial I would like to reiterate that this tutorial is derived largely from the PyTorch tutorial “<a class="af ox" href="https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html" rel="noopener ugc nofollow" target="_blank">Translation with a Sequence to Sequence Network and Attention</a>”. However, this tutorial is optimized in a number of ways. Most notably, this code allows for the data to be separated into batches (thus allowing us to utilize the enhanced parallel computing power of a GPU), can split datasets into a train and a test set, and also has added functionality to run on datasets of various formats.</p><p id="3b5b" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">Before we dive into the code tutorial, a little setup is in store. If you’d like to run the model on a GPU (highly recommended), this tutorial is going to be using Google Colab; which offers free access to Jupyter notebooks with GPU capability. If you have other access to a GPU then feel free to use that as well. 
Otherwise, you can look into a variety of other <a class="af ox" rel="noopener" target="_blank" href="/training-machine-learning-models-online-for-free-gpu-tpu-enabled-5def6a5c1ce3">free online GPU options</a>. The code can be run on a CPU, but the capability of any model will be constricted by computational power (and make sure to change to batch-size to 1 if you choose to do so).</p><p id="9bf0" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">To get started, navigate to <a class="af ox" href="https://colab.research.google.com" rel="noopener ugc nofollow" target="_blank">Google Colaboratory</a> and log into a Google account to get started. From here, navigate to File > New Python 3 Notebook to launch a Jupyter notebook. Once you’ve opened up a new notebook, we first need to enable GPU capabilities. To do so, navigate to the top left of the page and select Edit > Notebook Settings. From here select GPU in the dropdown menu under “Hardware accelerator.”</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div class="nj nk qw"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*xPZ_B0b44kdYuRp9jhQI1A.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*xPZ_B0b44kdYuRp9jhQI1A.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*xPZ_B0b44kdYuRp9jhQI1A.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*xPZ_B0b44kdYuRp9jhQI1A.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*xPZ_B0b44kdYuRp9jhQI1A.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*xPZ_B0b44kdYuRp9jhQI1A.png 1100w, https://miro.medium.com/v2/resize:fit:844/format:webp/1*xPZ_B0b44kdYuRp9jhQI1A.png 844w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 
700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 422px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*xPZ_B0b44kdYuRp9jhQI1A.png 640w, https://miro.medium.com/v2/resize:fit:720/1*xPZ_B0b44kdYuRp9jhQI1A.png 720w, https://miro.medium.com/v2/resize:fit:750/1*xPZ_B0b44kdYuRp9jhQI1A.png 750w, https://miro.medium.com/v2/resize:fit:786/1*xPZ_B0b44kdYuRp9jhQI1A.png 786w, https://miro.medium.com/v2/resize:fit:828/1*xPZ_B0b44kdYuRp9jhQI1A.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*xPZ_B0b44kdYuRp9jhQI1A.png 1100w, https://miro.medium.com/v2/resize:fit:844/1*xPZ_B0b44kdYuRp9jhQI1A.png 844w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 422px"/><img alt="" class="bh mq nw c" width="422" height="304" loading="eager" role="presentation"/></picture></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 14: Enabling GPU capabilities on Google Colab</figcaption></figure><p id="b817" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">We now have a Jupyter notebook with GPU capabilities and can start working towards creating an NMT model! 
First, we will import all of the necessary packages.</p><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><p id="ff32" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">Now, run the following code to check if GPU capabilities are enabled.</p><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><p id="cc28" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">If TRUE is returned, GPU is available. Now, before we begin doing any translation, we first need to create a number of functions which will prepare the data. The following functions serve to clean the data and allow functionality for us to remove sentences that are too long or whose input sentences don’t start with certain words.</p><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><p id="6125" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">Now, with functions that will clean the data, we need a way to transform this cleaned textual data into One Hot Encoding vectors. 
First, we create a <em class="oy">Lang </em>class which will essentially allow us to construct a vocabulary for both the input and output languages.</p><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><p id="a2d7" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">Next, we create a <em class="oy">prepareLangs</em> function which will take a dataset of translated sentences and create <em class="oy">Lang</em> classes for the input and the output languages of a dataset.</p><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><p id="efc7" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">This function has the ability to work with input and output sentences that are contained in two separate files or in a single file. If the sentences are in two separate files, each sentence must be separated by a newline and each line in the files must correspond to each other (i.e. make a sentence pair). 
For example, if your input file is english.txt and output file in espanol.txt the files should be formatted as in Figure 15.</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div role="button" tabindex="0" class="ns nt fj nu bh nv"><div class="nj nk ra"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*xCEe0bM_9WB00iZEYWj90Q.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*xCEe0bM_9WB00iZEYWj90Q.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*xCEe0bM_9WB00iZEYWj90Q.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*xCEe0bM_9WB00iZEYWj90Q.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*xCEe0bM_9WB00iZEYWj90Q.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*xCEe0bM_9WB00iZEYWj90Q.png 1100w, https://miro.medium.com/v2/resize:fit:1400/format:webp/1*xCEe0bM_9WB00iZEYWj90Q.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*xCEe0bM_9WB00iZEYWj90Q.png 640w, https://miro.medium.com/v2/resize:fit:720/1*xCEe0bM_9WB00iZEYWj90Q.png 720w, https://miro.medium.com/v2/resize:fit:750/1*xCEe0bM_9WB00iZEYWj90Q.png 750w, https://miro.medium.com/v2/resize:fit:786/1*xCEe0bM_9WB00iZEYWj90Q.png 786w, https://miro.medium.com/v2/resize:fit:828/1*xCEe0bM_9WB00iZEYWj90Q.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*xCEe0bM_9WB00iZEYWj90Q.png 1100w, https://miro.medium.com/v2/resize:fit:1400/1*xCEe0bM_9WB00iZEYWj90Q.png 
1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px"/><img alt="" class="bh mq nw c" width="700" height="199" loading="lazy" role="presentation"/></picture></div></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 15: Format for dataset stored in two separate files.</figcaption></figure><p id="2202" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">On the other hand, if the input and output sentences are stored in a single file, each sentence in the pair must be separated by a tab and each sentence pair must be separated by a newline. 
For example, if your single file name is data.txt, the file should be formatted as in Figure 16.</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div class="nj nk rb"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*hK9a8XjUdkyv-_nHd80KYw.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*hK9a8XjUdkyv-_nHd80KYw.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*hK9a8XjUdkyv-_nHd80KYw.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*hK9a8XjUdkyv-_nHd80KYw.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*hK9a8XjUdkyv-_nHd80KYw.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*hK9a8XjUdkyv-_nHd80KYw.png 1100w, https://miro.medium.com/v2/resize:fit:1376/format:webp/1*hK9a8XjUdkyv-_nHd80KYw.png 1376w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 688px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*hK9a8XjUdkyv-_nHd80KYw.png 640w, https://miro.medium.com/v2/resize:fit:720/1*hK9a8XjUdkyv-_nHd80KYw.png 720w, https://miro.medium.com/v2/resize:fit:750/1*hK9a8XjUdkyv-_nHd80KYw.png 750w, https://miro.medium.com/v2/resize:fit:786/1*hK9a8XjUdkyv-_nHd80KYw.png 786w, https://miro.medium.com/v2/resize:fit:828/1*hK9a8XjUdkyv-_nHd80KYw.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*hK9a8XjUdkyv-_nHd80KYw.png 1100w, https://miro.medium.com/v2/resize:fit:1376/1*hK9a8XjUdkyv-_nHd80KYw.png 1376w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, 
(-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 688px"/><img alt="" class="bh mq nw c" width="688" height="313" loading="lazy" role="presentation"/></picture></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 16: Format for dataset stored in one single file.</figcaption></figure><p id="d89c" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk"><em class="oy">Note: In order for this function to work with both one and two files, the file_path argument must be in the tuple format with two elements in the tuple if the data is stored in two files, and one element in the tuple if the data is stored in a single file.</em></p><p id="3c5b" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">With a function that works to prepare the language vocabularies for both the input and output languages, we can use all of the above functions to create a single function that will take a dataset of both input and target sentences and complete all of the preprocessing steps. Thus, the <em class="oy">prepareData </em>function will creates <em class="oy">Lang</em> classes for each language and fully clean and trim the data according to the specified passed arguments. 
In the end, this function will return both language classes along with a set of training pairs and a set of test pairs.</p><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><p id="a4d0" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">While we have created a vocabulary for each language, we still need to create functions which use these vocabularies to transform sentence pairs both to and from their One Hot Encoding vector representations.</p><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><p id="53f8" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">NMT is no different than normal machine learning in that <a class="af ox" href="https://machinelearningmastery.com/gentle-introduction-mini-batch-gradient-descent-configure-batch-size/" rel="noopener ugc nofollow" target="_blank">minibatch gradient descent is the most effective way to train a model</a>. Thus, before we begin building our model, we want to create a function to <em class="oy">batchify</em> our sentence pairs so that we can perform gradient descent on mini-batches. We also create the function <em class="oy">pad_batch</em> to handle the issue of variable length sentences in a batch. This function essentially appends &lt;EOS&gt; tags to the end of each of the shorter sentences until every sentence in the batch is the same length.</p><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><p id="9e98" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">And with that, we have created all of the necessary functions to preprocess the data and are finally ready to build our Encoder Decoder model! 
With a general understanding of the Encoder Decoder architecture and attention mechanisms, let’s dive into the Python code that creates these frameworks. Rather than explain each aspect of the Encoder and the Decoder, I will simply provide the code and refer you to the <a class="af ox" href="https://pytorch.org/docs/stable/nn.html" rel="noopener ugc nofollow" target="_blank">PyTorch documentation</a> for any questions you may have on various aspects of the code.</p><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><p id="3904" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">Now, in order to train and test the model, we will use the following functions. The <em class="oy">train_batch</em> function below performs a training loop on a single training batch. This includes completing a forward pass through the model to create a predicted translation for each sentence in the batch, computing the total loss for the batch, and then back-propagating on the loss to update all of the weight matrices in both the Encoder and the Decoder.</p><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><p id="8aad" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">The <em class="oy">train</em> function simply performs the <em class="oy">train_batch</em> function iteratively for each batch in a list of batches. 
In this way, we can pass a list of all of the training batches to complete a full epoch through the training data.</p><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><p id="6b6a" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">The following <em class="oy">test_batch</em> and <em class="oy">test</em> functions are essentially the same as the <em class="oy">train_batch</em> and <em class="oy">train</em> functions, with the exception that these test functions are to be performed on the test data and do not include a back-propagation step. Thus, these functions do not update the weight matrices in the model and are solely used to evaluate the loss (i.e. the accuracy) on the test data. In turn, this will help us track how the model performs on data outside of the training set.</p><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><p id="a2d9" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">During training, it will also be nice to be able to track our progress in a more qualitative sense. The <em class="oy">evaluate </em>function will allow us to do so by returning the predicted translation that our model makes for a given input sentence. 
And the <em class="oy">evaluate_randomly</em> function will simply predict translation for a specified number of sentences chosen randomly from the test set (if we have one) or the train set.</p><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><p id="2ef6" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">A few helper functions below will work to plot our training progress, print memory consumption, and reformat time measurements.</p><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><p id="472d" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">And finally, we can put all of these functions into a master function which we will call <em class="oy">train_and_test</em>. This function will take quite a few arguments, but will completely train our model while evaluating our progress on the train set (and test set if present) at specified intervals. Also, some arguments will specify whether we want to save the output in a separate .txt file, create a graph of the loss values over time, and also allow us to save the weights of both the Encoder and the Decoder for <a class="af ox" href="https://pytorch.org/tutorials/beginner/saving_loading_models.html" rel="noopener ugc nofollow" target="_blank">future use</a>. 
The next few cells after this function will outline how you can modify each argument, but just know that this function will essentially be all we need to run in order to train the model.</p><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><p id="8663" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">Now that we have everything in place we are ready to import our dataset, initialize all of the hyperparameters, and start training!</p><p id="20ba" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">First, in order to upload a dataset, run the following cell:</p><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><p id="efc2" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">And you will see the following:</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div class="nj nk rc"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*pHnANxJlp7IdPElhnvJWLw.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*pHnANxJlp7IdPElhnvJWLw.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*pHnANxJlp7IdPElhnvJWLw.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*pHnANxJlp7IdPElhnvJWLw.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*pHnANxJlp7IdPElhnvJWLw.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*pHnANxJlp7IdPElhnvJWLw.png 1100w, https://miro.medium.com/v2/resize:fit:766/format:webp/1*pHnANxJlp7IdPElhnvJWLw.png 766w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 
2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 383px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*pHnANxJlp7IdPElhnvJWLw.png 640w, https://miro.medium.com/v2/resize:fit:720/1*pHnANxJlp7IdPElhnvJWLw.png 720w, https://miro.medium.com/v2/resize:fit:750/1*pHnANxJlp7IdPElhnvJWLw.png 750w, https://miro.medium.com/v2/resize:fit:786/1*pHnANxJlp7IdPElhnvJWLw.png 786w, https://miro.medium.com/v2/resize:fit:828/1*pHnANxJlp7IdPElhnvJWLw.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*pHnANxJlp7IdPElhnvJWLw.png 1100w, https://miro.medium.com/v2/resize:fit:766/1*pHnANxJlp7IdPElhnvJWLw.png 766w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 383px"/><img alt="" class="bh mq nw c" width="383" height="87" loading="lazy" role="presentation"/></picture></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 17: Upload data to Google Colab</figcaption></figure><p id="e1e8" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">Simply click on the “Choose Files” button and navigate to the dataset you wish to upload. In this tutorial, we are using the same dataset that was used in the original PyTorch tutorial. 
You can download that dataset of English to French translations <a class="af ox" href="https://github.com/qlanners/nmt_tutorial/blob/master/eng-fra.txt" rel="noopener ugc nofollow" target="_blank">here</a>. You can also experiment with a number of other datasets of various languages <a class="af ox" href="https://www.manythings.org/anki/" rel="noopener ugc nofollow" target="_blank">here</a>.</p><p id="0400" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">If you are looking to get more state-of-the-art results I’d recommend trying to train on a larger dataset. You can find some larger datasets <a class="af ox" href="http://www.statmt.org/wmt14/translation-task.html" rel="noopener ugc nofollow" target="_blank">here</a>, but also feel free to use any corpus of translated excerpts as long as they are formatted like in Figure 15 or Figure 16 above.</p><p id="aeb8" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk"><em class="oy">Note: You may have issues uploading larger datasets to Google Colab using the upload method presented in this tutorial. 
If you run into such issues, read </em><a class="af ox" href="https://www.freecodecamp.org/news/how-to-transfer-large-files-to-google-colab-and-remote-jupyter-notebooks-26ca252892fa/" rel="noopener ugc nofollow" target="_blank"><em class="oy">this article</em></a><em class="oy"> to learn how to upload large files.</em></p><p id="5a51" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">Now, run the following cell to ensure that your dataset has been successfully uploaded.</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div class="nj nk rd"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*TNecR90YCwzPKhiOZCNytg.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*TNecR90YCwzPKhiOZCNytg.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*TNecR90YCwzPKhiOZCNytg.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*TNecR90YCwzPKhiOZCNytg.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*TNecR90YCwzPKhiOZCNytg.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*TNecR90YCwzPKhiOZCNytg.png 1100w, https://miro.medium.com/v2/resize:fit:460/format:webp/1*TNecR90YCwzPKhiOZCNytg.png 460w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 230px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*TNecR90YCwzPKhiOZCNytg.png 640w, https://miro.medium.com/v2/resize:fit:720/1*TNecR90YCwzPKhiOZCNytg.png 720w, 
https://miro.medium.com/v2/resize:fit:750/1*TNecR90YCwzPKhiOZCNytg.png 750w, https://miro.medium.com/v2/resize:fit:786/1*TNecR90YCwzPKhiOZCNytg.png 786w, https://miro.medium.com/v2/resize:fit:828/1*TNecR90YCwzPKhiOZCNytg.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*TNecR90YCwzPKhiOZCNytg.png 1100w, https://miro.medium.com/v2/resize:fit:460/1*TNecR90YCwzPKhiOZCNytg.png 460w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 230px"/><img alt="" class="bh mq nw c" width="230" height="67" loading="lazy" role="presentation"/></picture></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 18: Run ls to ensure dataset has been uploaded</figcaption></figure><p id="8af5" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">From here, edit the following cells to apply to your dataset and desires.</p><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><p id="46ba" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">The following cell consists of the variety of hyperparameters that you are going to need to play with towards finding an effective NMT model. 
So have fun experimenting with these.</p><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><p id="5ded" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">And finally, you just need to run the following cell to train your model according to all of the hyperparameters you set above.</p><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><p id="0f2b" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">And voilà! You have just trained an NMT model! Congrats! If you saved any graphs, output files, or output weights, you can view all of the saved files by running ls again. And to download any of these files simply run the code below.</p><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><p id="13b6" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">Now, if you’d like to test the model on sentences outside both the train and the test set you can do that as well. Just make sure the sentence you are trying to translate is in the same language as the input language of your model.</p><figure class="nm nn no np nq nr"><div class="qx jr l fj"><div class="qy qz l"></div></div></figure><p id="f262" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">I trained my model and the PyTorch tutorial model on the same dataset used in the PyTorch tutorial (which is the same dataset of English to French translations mentioned above). To preprocess the data, the trim was set to 10 and the eng_prefixes filters that PyTorch used was set to TRUE. With these restrictions, the dataset was cut to a rather small set of 10,853 sentence pairs. 
The PyTorch tutorial broke one of the fundamental rules of machine learning and didn’t use a test set (not good practice!). So, just for comparison purposes, I kept all of these sentence pairs in my train set and didn’t use a test set (i.e. perc_train_set = 1.0). However, I’d recommend that you always use a test set when training any sort of machine learning model.</p><p id="94b5" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">A comparison of the hyperparameters I chose for my model vs. the hyperparameters in the PyTorch tutorial model is shown in Table 1. The graph below in Figure 19 depicts the results of training for 40 minutes on an NVIDIA GeForce GTX 1080 (a bit older GPU, you can actually achieve superior results using Google Colab).</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div class="nj nk re"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*ldHhAVYcbeRxZcQ4u3HPWQ.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*ldHhAVYcbeRxZcQ4u3HPWQ.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*ldHhAVYcbeRxZcQ4u3HPWQ.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*ldHhAVYcbeRxZcQ4u3HPWQ.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*ldHhAVYcbeRxZcQ4u3HPWQ.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*ldHhAVYcbeRxZcQ4u3HPWQ.png 1100w, https://miro.medium.com/v2/resize:fit:892/format:webp/1*ldHhAVYcbeRxZcQ4u3HPWQ.png 892w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, 
(-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 446px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*ldHhAVYcbeRxZcQ4u3HPWQ.png 640w, https://miro.medium.com/v2/resize:fit:720/1*ldHhAVYcbeRxZcQ4u3HPWQ.png 720w, https://miro.medium.com/v2/resize:fit:750/1*ldHhAVYcbeRxZcQ4u3HPWQ.png 750w, https://miro.medium.com/v2/resize:fit:786/1*ldHhAVYcbeRxZcQ4u3HPWQ.png 786w, https://miro.medium.com/v2/resize:fit:828/1*ldHhAVYcbeRxZcQ4u3HPWQ.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*ldHhAVYcbeRxZcQ4u3HPWQ.png 1100w, https://miro.medium.com/v2/resize:fit:892/1*ldHhAVYcbeRxZcQ4u3HPWQ.png 892w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 446px"/><img alt="" class="bh mq nw c" width="446" height="184" loading="lazy" role="presentation"/></picture></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Table 1: Hyperparameters comparison</figcaption></figure><figure class="nm nn no np nq nr nj nk paragraph-image"><div role="button" tabindex="0" class="ns nt fj nu bh nv"><div class="nj nk rf"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*gWMmg6860K7Q8uXhOKAk4A.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*gWMmg6860K7Q8uXhOKAk4A.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*gWMmg6860K7Q8uXhOKAk4A.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*gWMmg6860K7Q8uXhOKAk4A.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*gWMmg6860K7Q8uXhOKAk4A.png 828w, 
https://miro.medium.com/v2/resize:fit:1100/format:webp/1*gWMmg6860K7Q8uXhOKAk4A.png 1100w, https://miro.medium.com/v2/resize:fit:1400/format:webp/1*gWMmg6860K7Q8uXhOKAk4A.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*gWMmg6860K7Q8uXhOKAk4A.png 640w, https://miro.medium.com/v2/resize:fit:720/1*gWMmg6860K7Q8uXhOKAk4A.png 720w, https://miro.medium.com/v2/resize:fit:750/1*gWMmg6860K7Q8uXhOKAk4A.png 750w, https://miro.medium.com/v2/resize:fit:786/1*gWMmg6860K7Q8uXhOKAk4A.png 786w, https://miro.medium.com/v2/resize:fit:828/1*gWMmg6860K7Q8uXhOKAk4A.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*gWMmg6860K7Q8uXhOKAk4A.png 1100w, https://miro.medium.com/v2/resize:fit:1400/1*gWMmg6860K7Q8uXhOKAk4A.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px"/><img alt="" class="bh mq nw c" width="700" height="625" loading="lazy" role="presentation"/></picture></div></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 19: Loss over 40 minute training period for this tutorial 
model (My Model) vs PyTorch Tutorial Model</figcaption></figure><p id="2429" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">Since this dataset has no test set, I evaluated the model on a few sentences from the train set.</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div class="nj nk rg"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*rUvPuDqINjJPUwnpe8uyEw.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*rUvPuDqINjJPUwnpe8uyEw.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*rUvPuDqINjJPUwnpe8uyEw.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*rUvPuDqINjJPUwnpe8uyEw.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*rUvPuDqINjJPUwnpe8uyEw.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*rUvPuDqINjJPUwnpe8uyEw.png 1100w, https://miro.medium.com/v2/resize:fit:890/format:webp/1*rUvPuDqINjJPUwnpe8uyEw.png 890w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 445px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*rUvPuDqINjJPUwnpe8uyEw.png 640w, https://miro.medium.com/v2/resize:fit:720/1*rUvPuDqINjJPUwnpe8uyEw.png 720w, https://miro.medium.com/v2/resize:fit:750/1*rUvPuDqINjJPUwnpe8uyEw.png 750w, https://miro.medium.com/v2/resize:fit:786/1*rUvPuDqINjJPUwnpe8uyEw.png 786w, https://miro.medium.com/v2/resize:fit:828/1*rUvPuDqINjJPUwnpe8uyEw.png 828w, 
https://miro.medium.com/v2/resize:fit:1100/1*rUvPuDqINjJPUwnpe8uyEw.png 1100w, https://miro.medium.com/v2/resize:fit:890/1*rUvPuDqINjJPUwnpe8uyEw.png 890w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 445px"/><img alt="" class="bh mq nw c" width="445" height="339" loading="lazy" role="presentation"/></picture></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 20: Predicted translation of PyTorch tutorial model (Blue) vs. My Model (Orange)</figcaption></figure><p id="d0f8" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">From these results, we can see that the model in this tutorial can create a more effective translation model in the same amount of training time. However, when we try to use this model to translate sentences outside of the train set, it immediately breaks down. 
We can see this in the model’s attempted translation of the following sentence which was NOT in the dataset.</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div role="button" tabindex="0" class="ns nt fj nu bh nv"><div class="nj nk rh"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*XpJeIJvIWRivzTtG5ZHxdw.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*XpJeIJvIWRivzTtG5ZHxdw.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*XpJeIJvIWRivzTtG5ZHxdw.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*XpJeIJvIWRivzTtG5ZHxdw.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*XpJeIJvIWRivzTtG5ZHxdw.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*XpJeIJvIWRivzTtG5ZHxdw.png 1100w, https://miro.medium.com/v2/resize:fit:1400/format:webp/1*XpJeIJvIWRivzTtG5ZHxdw.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*XpJeIJvIWRivzTtG5ZHxdw.png 640w, https://miro.medium.com/v2/resize:fit:720/1*XpJeIJvIWRivzTtG5ZHxdw.png 720w, https://miro.medium.com/v2/resize:fit:750/1*XpJeIJvIWRivzTtG5ZHxdw.png 750w, https://miro.medium.com/v2/resize:fit:786/1*XpJeIJvIWRivzTtG5ZHxdw.png 786w, https://miro.medium.com/v2/resize:fit:828/1*XpJeIJvIWRivzTtG5ZHxdw.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*XpJeIJvIWRivzTtG5ZHxdw.png 1100w, https://miro.medium.com/v2/resize:fit:1400/1*XpJeIJvIWRivzTtG5ZHxdw.png 1400w" 
sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px"/><img alt="" class="bh mq nw c" width="700" height="98" loading="lazy" role="presentation"/></picture></div></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 21: Failed translation on sentence outside the dataset.</figcaption></figure><p id="6fe7" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">This failure of the model is largely due to the fact that it was trained on such a small dataset. Furthermore, we were not aware of this problem because we had no test set to check the model’s ability to translate on sentences outside of the train set. To combat this issue, I retrained my model on the same dataset, this time with a trim=40 and without the eng_prefixes filter. Even when I set aside 10% of the sentence pairs for a test set, the train set was still over 10x the size of the one used to train the model before (122,251 train pairs). I also modified the hidden size of the model from 440 to 1080 and decreased the batch size from 32 to 10. Finally, I changed the initial learning rate to 0.5 and installed a learning rate schedule which decreased the learning rate by a factor of five after every five epochs.</p><p id="2f4c" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">With this larger dataset and updated hyperparameters, the model was trained on the same GPU. 
The loss on the train and test set during training, as well as the translation of the same sentence it failed on above, are shown below.</p><figure class="nm nn no np nq nr nj nk paragraph-image"><div class="nj nk ri"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*6v4ywPVnzs2Km4HkAPlCZw.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*6v4ywPVnzs2Km4HkAPlCZw.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*6v4ywPVnzs2Km4HkAPlCZw.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*6v4ywPVnzs2Km4HkAPlCZw.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*6v4ywPVnzs2Km4HkAPlCZw.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*6v4ywPVnzs2Km4HkAPlCZw.png 1100w, https://miro.medium.com/v2/resize:fit:992/format:webp/1*6v4ywPVnzs2Km4HkAPlCZw.png 992w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 496px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*6v4ywPVnzs2Km4HkAPlCZw.png 640w, https://miro.medium.com/v2/resize:fit:720/1*6v4ywPVnzs2Km4HkAPlCZw.png 720w, https://miro.medium.com/v2/resize:fit:750/1*6v4ywPVnzs2Km4HkAPlCZw.png 750w, https://miro.medium.com/v2/resize:fit:786/1*6v4ywPVnzs2Km4HkAPlCZw.png 786w, https://miro.medium.com/v2/resize:fit:828/1*6v4ywPVnzs2Km4HkAPlCZw.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*6v4ywPVnzs2Km4HkAPlCZw.png 1100w, https://miro.medium.com/v2/resize:fit:992/1*6v4ywPVnzs2Km4HkAPlCZw.png 992w" sizes="(min-resolution: 4dppx) and (max-width: 
700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 496px"/><img alt="" class="bh mq nw c" width="496" height="398" loading="lazy" role="presentation"/></picture></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 22: Train and Test loss vs. time</figcaption></figure><figure class="nm nn no np nq nr nj nk paragraph-image"><div role="button" tabindex="0" class="ns nt fj nu bh nv"><div class="nj nk rj"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*rAAWxzXj8zRfE6cfxrEzOQ.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*rAAWxzXj8zRfE6cfxrEzOQ.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*rAAWxzXj8zRfE6cfxrEzOQ.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*rAAWxzXj8zRfE6cfxrEzOQ.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*rAAWxzXj8zRfE6cfxrEzOQ.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*rAAWxzXj8zRfE6cfxrEzOQ.png 1100w, https://miro.medium.com/v2/resize:fit:1400/format:webp/1*rAAWxzXj8zRfE6cfxrEzOQ.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px" type="image/webp"/><source data-testid="og" 
srcSet="https://miro.medium.com/v2/resize:fit:640/1*rAAWxzXj8zRfE6cfxrEzOQ.png 640w, https://miro.medium.com/v2/resize:fit:720/1*rAAWxzXj8zRfE6cfxrEzOQ.png 720w, https://miro.medium.com/v2/resize:fit:750/1*rAAWxzXj8zRfE6cfxrEzOQ.png 750w, https://miro.medium.com/v2/resize:fit:786/1*rAAWxzXj8zRfE6cfxrEzOQ.png 786w, https://miro.medium.com/v2/resize:fit:828/1*rAAWxzXj8zRfE6cfxrEzOQ.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*rAAWxzXj8zRfE6cfxrEzOQ.png 1100w, https://miro.medium.com/v2/resize:fit:1400/1*rAAWxzXj8zRfE6cfxrEzOQ.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px"/><img alt="" class="bh mq nw c" width="700" height="97" loading="lazy" role="presentation"/></picture></div></div><figcaption class="nx ff ny nj nk nz oa bf b bg z du">Figure 23: Improved (yet still imperfect) translation of sentence outside of the dataset.</figcaption></figure><p id="7d07" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">As you can see, the translation of this sentence is significantly improved. 
However, in order to achieve a perfect translation, we would probably need to increase the size of the dataset by even more.</p></div></div></div><div class="ab cb oz pa pb pc" role="separator"><span class="pd by bm pe pf pg"></span><span class="pd by bm pe pf pg"></span><span class="pd by bm pe pf"></span></div><div class="gn go gp gq gr"><div class="ab cb"><div class="ci bh fz ga gb gc"><h1 id="b6d8" class="ph pi gu bf pj pk pl hu pm pn po hx pp pq pr ps pt pu pv pw px py pz qa qb qc bk">Conclusion</h1><p id="c3d6" class="pw-post-body-paragraph ob oc gu od b hs qd of og hv qe oi oj ok qf om on oo qg oq or os qh ou ov ow gn bk">While this tutorial provides an introduction to NMT using the Encoder Decoder structure, the implemented attention mechanism is rather basic. If you are interested in creating a more state-of-the-art model I’d recommend looking into the concept of local attention and attempting to implement this more advanced type of attention within the Decoder portion of the model.</p><p id="5141" class="pw-post-body-paragraph ob oc gu od b hs oe of og hv oh oi oj ok ol om on oo op oq or os ot ou ov ow gn bk">Otherwise, I hope you enjoyed the tutorial and learned a lot! The basis of the material covered in this post was from my thesis at Loyola Marymount University. If you want to take a look at the PPT presentation I used to share these ideas (which includes the majority of the images in this article) you can find that <a class="af ox" href="https://github.com/qlanners/nmt_tutorial/blob/master/thesis_presentation.pptx" rel="noopener ugc nofollow" target="_blank">here</a>. You can also read the Thesis paper I wrote on the topic, which explains the math behind NMT in much greater depth, <a class="af ox" href="https://github.com/qlanners/nmt_tutorial/blob/master/quinn_thesis_final.pdf" rel="noopener ugc nofollow" target="_blank">here</a>. 
And lastly, the full Jupyter notebook for this project can be found <a class="af ox" href="https://github.com/qlanners/nmt_tutorial/blob/master/nmt_tutorial.ipynb" rel="noopener ugc nofollow" target="_blank">here</a> or alternatively a Python script version can be found <a class="af ox" href="https://github.com/qlanners/nmt_tutorial/blob/master/nmt_tutorial.py" rel="noopener ugc nofollow" target="_blank">here</a>.</p></div></div></div></div></section></div></div></article></div><div class="ab cb"><div class="ci bh fz ga gb gc"><div class="rk rl ab jm"><div class="rm ab"><a class="rn ay am ao" href="https://medium.com/tag/machine-learning?source=post_page-----15ecf6b0b--------------------------------" rel="noopener follow"><div class="ro fj cx rp ge rq rr bf b bg z bk rs">Machine Learning</div></a></div><div class="rm ab"><a class="rn ay am ao" href="https://medium.com/tag/recurrent-neural-network?source=post_page-----15ecf6b0b--------------------------------" rel="noopener follow"><div class="ro fj cx rp ge rq rr bf b bg z bk rs">Recurrent Neural Network</div></a></div><div class="rm ab"><a class="rn ay am ao" href="https://medium.com/tag/machine-translation?source=post_page-----15ecf6b0b--------------------------------" rel="noopener follow"><div class="ro fj cx rp ge rq rr bf b bg z bk rs">Machine Translation</div></a></div><div class="rm ab"><a class="rn ay am ao" href="https://medium.com/tag/deep-learning?source=post_page-----15ecf6b0b--------------------------------" rel="noopener follow"><div class="ro fj cx rp ge rq rr bf b bg z bk rs">Deep Learning</div></a></div><div class="rm ab"><a class="rn ay am ao" href="https://medium.com/tag/pytorch?source=post_page-----15ecf6b0b--------------------------------" rel="noopener follow"><div class="ro fj cx rp ge rq rr bf b bg z bk rs">Pytorch</div></a></div></div></div></div><div class="l"></div><footer class="rt pa ru rv rw ab q rx iw c"><div class="l ae"><div class="ab cb"><div class="ci bh fz ga gb gc"><div 
class="ab cp ry"><div class="ab q lh"><div class="rz l"><span class="l sa sb sc e d"><div class="ab q lh li"><div class="pw-multi-vote-icon fj jq lj lk ll"><span><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="footerClapButton" href="https://medium.com/m/signin?actionUrl=https%3A%2F%2Fmedium.com%2F_%2Fvote%2Ftowards-data-science%2F15ecf6b0b&operation=register&redirect=https%3A%2F%2Ftowardsdatascience.com%2Fneural-machine-translation-15ecf6b0b&user=Quinn+Lanners&userId=8a2f7df48b90&source=---footer_actions--15ecf6b0b---------------------clap_footer-----------" rel="noopener follow"><div><div class="bm" aria-hidden="false"><div class="lm ao ln lo lp lq am lr ls lt ll"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" aria-label="clap"><path fill-rule="evenodd" d="M11.37.828 12 3.282l.63-2.454zM13.916 3.953l1.523-2.112-1.184-.39zM8.589 1.84l1.522 2.112-.337-2.501zM18.523 18.92c-.86.86-1.75 1.246-2.62 1.33a6 6 0 0 0 .407-.372c2.388-2.389 2.86-4.951 1.399-7.623l-.912-1.603-.79-1.672c-.26-.56-.194-.98.203-1.288a.7.7 0 0 1 .546-.132c.283.046.546.231.728.5l2.363 4.157c.976 1.624 1.141 4.237-1.324 6.702m-10.999-.438L3.37 14.328a.828.828 0 0 1 .585-1.408.83.83 0 0 1 .585.242l2.158 2.157a.365.365 0 0 0 .516-.516l-2.157-2.158-1.449-1.449a.826.826 0 0 1 1.167-1.17l3.438 3.44a.363.363 0 0 0 .516 0 .364.364 0 0 0 0-.516L5.293 9.513l-.97-.97a.826.826 0 0 1 0-1.166.84.84 0 0 1 1.167 0l.97.968 3.437 3.436a.36.36 0 0 0 .517 0 .366.366 0 0 0 0-.516L6.977 7.83a.82.82 0 0 1-.241-.584.82.82 0 0 1 .824-.826c.219 0 .43.087.584.242l5.787 5.787a.366.366 0 0 0 .587-.415l-1.117-2.363c-.26-.56-.194-.98.204-1.289a.7.7 0 0 1 .546-.132c.283.046.545.232.727.501l2.193 3.86c1.302 2.38.883 4.59-1.277 6.75-1.156 1.156-2.602 1.627-4.19 1.367-1.418-.236-2.866-1.033-4.079-2.246M10.75 5.971l2.12 2.12c-.41.502-.465 1.17-.128 1.89l.22.465-3.523-3.523a.8.8 0 0 1-.097-.368c0-.22.086-.428.241-.584a.847.847 0 0 1 1.167 0m7.355 
1.705c-.31-.461-.746-.758-1.23-.837a1.44 1.44 0 0 0-1.11.275c-.312.24-.505.543-.59.881a1.74 1.74 0 0 0-.906-.465 1.47 1.47 0 0 0-.82.106l-2.182-2.182a1.56 1.56 0 0 0-2.2 0 1.54 1.54 0 0 0-.396.701 1.56 1.56 0 0 0-2.21-.01 1.55 1.55 0 0 0-.416.753c-.624-.624-1.649-.624-2.237-.037a1.557 1.557 0 0 0 0 2.2c-.239.1-.501.238-.715.453a1.56 1.56 0 0 0 0 2.2l.516.515a1.556 1.556 0 0 0-.753 2.615L7.01 19c1.32 1.319 2.909 2.189 4.475 2.449q.482.08.971.08c.85 0 1.653-.198 2.393-.579.231.033.46.054.686.054 1.266 0 2.457-.52 3.505-1.567 2.763-2.763 2.552-5.734 1.439-7.586z" clip-rule="evenodd"></path></svg></div></div></div></a></span></div><div class="pw-multi-vote-count l lu lv lw lx ly lz ma"><p class="bf b dv z du"><span class="mb">--</span></p></div></div></span><span class="l h g f sd se"><div class="ab q lh li"><div class="pw-multi-vote-icon fj jq lj lk ll"><span><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="footerClapButton" href="https://medium.com/m/signin?actionUrl=https%3A%2F%2Fmedium.com%2F_%2Fvote%2Ftowards-data-science%2F15ecf6b0b&operation=register&redirect=https%3A%2F%2Ftowardsdatascience.com%2Fneural-machine-translation-15ecf6b0b&user=Quinn+Lanners&userId=8a2f7df48b90&source=---footer_actions--15ecf6b0b---------------------clap_footer-----------" rel="noopener follow"><div><div class="bm" aria-hidden="false"><div class="lm ao ln lo lp lq am lr ls lt ll"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" aria-label="clap"><path fill-rule="evenodd" d="M11.37.828 12 3.282l.63-2.454zM13.916 3.953l1.523-2.112-1.184-.39zM8.589 1.84l1.522 2.112-.337-2.501zM18.523 18.92c-.86.86-1.75 1.246-2.62 1.33a6 6 0 0 0 .407-.372c2.388-2.389 2.86-4.951 1.399-7.623l-.912-1.603-.79-1.672c-.26-.56-.194-.98.203-1.288a.7.7 0 0 1 .546-.132c.283.046.546.231.728.5l2.363 4.157c.976 1.624 1.141 4.237-1.324 6.702m-10.999-.438L3.37 14.328a.828.828 0 0 1 .585-1.408.83.83 0 0 1 .585.242l2.158 2.157a.365.365 0 0 0 
.516-.516l-2.157-2.158-1.449-1.449a.826.826 0 0 1 1.167-1.17l3.438 3.44a.363.363 0 0 0 .516 0 .364.364 0 0 0 0-.516L5.293 9.513l-.97-.97a.826.826 0 0 1 0-1.166.84.84 0 0 1 1.167 0l.97.968 3.437 3.436a.36.36 0 0 0 .517 0 .366.366 0 0 0 0-.516L6.977 7.83a.82.82 0 0 1-.241-.584.82.82 0 0 1 .824-.826c.219 0 .43.087.584.242l5.787 5.787a.366.366 0 0 0 .587-.415l-1.117-2.363c-.26-.56-.194-.98.204-1.289a.7.7 0 0 1 .546-.132c.283.046.545.232.727.501l2.193 3.86c1.302 2.38.883 4.59-1.277 6.75-1.156 1.156-2.602 1.627-4.19 1.367-1.418-.236-2.866-1.033-4.079-2.246M10.75 5.971l2.12 2.12c-.41.502-.465 1.17-.128 1.89l.22.465-3.523-3.523a.8.8 0 0 1-.097-.368c0-.22.086-.428.241-.584a.847.847 0 0 1 1.167 0m7.355 1.705c-.31-.461-.746-.758-1.23-.837a1.44 1.44 0 0 0-1.11.275c-.312.24-.505.543-.59.881a1.74 1.74 0 0 0-.906-.465 1.47 1.47 0 0 0-.82.106l-2.182-2.182a1.56 1.56 0 0 0-2.2 0 1.54 1.54 0 0 0-.396.701 1.56 1.56 0 0 0-2.21-.01 1.55 1.55 0 0 0-.416.753c-.624-.624-1.649-.624-2.237-.037a1.557 1.557 0 0 0 0 2.2c-.239.1-.501.238-.715.453a1.56 1.56 0 0 0 0 2.2l.516.515a1.556 1.556 0 0 0-.753 2.615L7.01 19c1.32 1.319 2.909 2.189 4.475 2.449q.482.08.971.08c.85 0 1.653-.198 2.393-.579.231.033.46.054.686.054 1.266 0 2.457-.52 3.505-1.567 2.763-2.763 2.552-5.734 1.439-7.586z" clip-rule="evenodd"></path></svg></div></div></div></a></span></div><div class="pw-multi-vote-count l lu lv lw lx ly lz ma"><p class="bf b dv z du"><span class="mb">--</span></p></div></div></span></div><div class="bq ab"><div><div class="bm" aria-hidden="false"><button class="ao lm me mf ab q fk mg mh" aria-label="responses"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" class="md"><path d="M18.006 16.803c1.533-1.456 2.234-3.325 2.234-5.321C20.24 7.357 16.709 4 12.191 4S4 7.357 4 11.482c0 4.126 3.674 7.482 8.191 7.482.817 0 1.622-.111 2.393-.327.231.2.48.391.744.559 1.06.693 2.203 1.044 3.399 1.044.224-.008.4-.112.486-.287a.49.49 0 0 0-.042-.518c-.495-.67-.845-1.364-1.04-2.057a4 4 0 0 
1-.125-.598zm-3.122 1.055-.067-.223-.315.096a8 8 0 0 1-2.311.338c-4.023 0-7.292-2.955-7.292-6.587 0-3.633 3.269-6.588 7.292-6.588 4.014 0 7.112 2.958 7.112 6.593 0 1.794-.608 3.469-2.027 4.72l-.195.168v.255c0 .056 0 .151.016.295.025.231.081.478.154.733.154.558.398 1.117.722 1.659a5.3 5.3 0 0 1-2.165-.845c-.276-.176-.714-.383-.941-.59z"></path></svg><p class="bf b bg z du"><span class="pw-responses-count mc md">5</span></p></button></div></div></div></div><div class="ab q"><div class="pg l jj"><div><div class="bm" aria-hidden="false"><span><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="footerBookmarkButton" href="https://medium.com/m/signin?actionUrl=https%3A%2F%2Fmedium.com%2F_%2Fbookmark%2Fp%2F15ecf6b0b&operation=register&redirect=https%3A%2F%2Ftowardsdatascience.com%2Fneural-machine-translation-15ecf6b0b&source=---footer_actions--15ecf6b0b---------------------bookmark_footer-----------" rel="noopener follow"><svg xmlns="http://www.w3.org/2000/svg" width="25" height="25" fill="none" viewBox="0 0 25 25" class="du mj" aria-label="Add to list bookmark button"><path fill="currentColor" d="M18 2.5a.5.5 0 0 1 1 0V5h2.5a.5.5 0 0 1 0 1H19v2.5a.5.5 0 1 1-1 0V6h-2.5a.5.5 0 0 1 0-1H18zM7 7a1 1 0 0 1 1-1h3.5a.5.5 0 0 0 0-1H8a2 2 0 0 0-2 2v14a.5.5 0 0 0 .805.396L12.5 17l5.695 4.396A.5.5 0 0 0 19 21v-8.5a.5.5 0 0 0-1 0v7.485l-5.195-4.012a.5.5 0 0 0-.61 0L7 19.985z"></path></svg></a></span></div></div></div><div class="pg l jj"><div class="bm" aria-hidden="false" aria-describedby="postFooterSocialMenu" aria-labelledby="postFooterSocialMenu"><div><div class="bm" aria-hidden="false"><button aria-controls="postFooterSocialMenu" aria-expanded="false" aria-label="Share Post" data-testid="footerSocialShareButton" class="af fk ah ai aj ak al mr an ao ap ex ms mt mh mu"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" fill="none" viewBox="0 0 24 24"><path fill="currentColor" fill-rule="evenodd" d="M15.218 4.931a.4.4 0 0 1-.118.132l.012.006a.45.45 0 0 
1-.292.074.5.5 0 0 1-.3-.13l-2.02-2.02v7.07c0 .28-.23.5-.5.5s-.5-.22-.5-.5v-7.04l-2 2a.45.45 0 0 1-.57.04h-.02a.4.4 0 0 1-.16-.3.4.4 0 0 1 .1-.32l2.8-2.8a.5.5 0 0 1 .7 0l2.8 2.79a.42.42 0 0 1 .068.498m-.106.138.008.004v-.01zM16 7.063h1.5a2 2 0 0 1 2 2v10a2 2 0 0 1-2 2h-11c-1.1 0-2-.9-2-2v-10a2 2 0 0 1 2-2H8a.5.5 0 0 1 .35.15.5.5 0 0 1 .15.35.5.5 0 0 1-.15.35.5.5 0 0 1-.35.15H6.4c-.5 0-.9.4-.9.9v10.2a.9.9 0 0 0 .9.9h11.2c.5 0 .9-.4.9-.9v-10.2c0-.5-.4-.9-.9-.9H16a.5.5 0 0 1 0-1" clip-rule="evenodd"></path></svg></button></div></div></div></div></div></div></div></div></div></footer><div class="sf sg sh si sj l"><div class="ab cb"><div class="ci bh fz ga gb gc"><div class="sk bh r sl"></div><div class="sm l"><div class="ab sn so sp jl jk"><div class="sq sr ss st su sv sw sx sy sz ab cp"><div class="h k"><a href="https://towardsdatascience.com/?source=post_page---post_publication_info--15ecf6b0b--------------------------------" rel="noopener follow"><div class="fj ab"><img alt="Towards Data Science" class="ta in io cx" src="https://miro.medium.com/v2/resize:fill:96:96/1*CJe3891yB1A1mzMdqemkdg.jpeg" width="48" height="48" loading="lazy"/><div class="ta l io in fs n fr tb"></div></div></a></div><div class="j i d"><a href="https://towardsdatascience.com/?source=post_page---post_publication_info--15ecf6b0b--------------------------------" rel="noopener follow"><div class="fj ab"><img alt="Towards Data Science" class="ta td tc cx" src="https://miro.medium.com/v2/resize:fill:128:128/1*CJe3891yB1A1mzMdqemkdg.jpeg" width="64" height="64" loading="lazy"/><div class="ta l tc td fs n fr tb"></div></div></a></div><div class="j i d te jj"><div class="ab"><span><a class="bf b bg z tf ro tg th ti tj tk ev ew tl tm tn fa fb fc fd bm fe ff" 
href="https://medium.com/m/signin?actionUrl=https%3A%2F%2Fmedium.com%2F_%2Fsubscribe%2Fcollection%2Ftowards-data-science&operation=register&redirect=https%3A%2F%2Ftowardsdatascience.com%2Fneural-machine-translation-15ecf6b0b&collection=Towards+Data+Science&collectionId=7f60cf5620c9&source=post_page---post_publication_info--15ecf6b0b---------------------follow_profile-----------" rel="noopener follow">Follow</a></span></div></div></div><div class="ab co to"><div class="tp tq tr ts tt l"><a class="af ag ah aj ak al am an ao ap aq ar as at ab q" href="https://towardsdatascience.com/?source=post_page---post_publication_info--15ecf6b0b--------------------------------" rel="noopener follow"><h2 class="pw-author-name bf tv tw tx ty tz ua ub ok uc ud oo ue uf os ug uh bk"><span class="gn tu">Published in <!-- -->Towards Data Science</span></h2></a><div class="rm ab im"><div class="l jj"><span class="pw-follower-count bf b bg z du"><a class="af ag ah ai aj ak al am an ao ap aq ar jc" rel="noopener follow" href="/followers?source=post_page---post_publication_info--15ecf6b0b--------------------------------">768K Followers</a></span></div><div class="bf b bg z du ab jp"><span class="jd l" aria-hidden="true"><span class="bf b bg z du">·</span></span><a class="af ag ah ai aj ak al am an ao ap aq ar jc" rel="noopener follow" href="/roadmap-to-becoming-a-data-scientist-part-1-maths-2dc9beb69b27?source=post_page---post_publication_info--15ecf6b0b--------------------------------">Last published <!-- -->just now</a></div></div><div class="ui l"><p class="bf b bg z bk"><span class="gn">Your home for data science and AI. 
The world’s leading publication for data science, data analytics, data engineering, machine learning, and artificial intelligence professionals.</span></p></div></div></div><div class="h k"><div class="ab"><span><a class="bf b bg z tf ro tg th ti tj tk ev ew tl tm tn fa fb fc fd bm fe ff" href="https://medium.com/m/signin?actionUrl=https%3A%2F%2Fmedium.com%2F_%2Fsubscribe%2Fcollection%2Ftowards-data-science&operation=register&redirect=https%3A%2F%2Ftowardsdatascience.com%2Fneural-machine-translation-15ecf6b0b&collection=Towards+Data+Science&collectionId=7f60cf5620c9&source=post_page---post_publication_info--15ecf6b0b---------------------follow_profile-----------" rel="noopener follow">Follow</a></span></div></div></div></div><div class="ab sn so sp jl jk"><div class="sq sr ss st su sv sw sx sy sz ab cp"><div class="h k"><a tabindex="0" href="https://medium.com/@lannersq?source=post_page---post_author_info--15ecf6b0b--------------------------------" rel="noopener follow"><div class="l fj"><img alt="Quinn Lanners" class="l fd by io in cx" src="https://miro.medium.com/v2/resize:fill:96:96/2*Brk5nEh8iz86Uf-730hMgA.png" width="48" height="48" loading="lazy"/><div class="fr by l io in fs n ay tb"></div></div></a></div><div class="j i d"><a tabindex="0" href="https://medium.com/@lannersq?source=post_page---post_author_info--15ecf6b0b--------------------------------" rel="noopener follow"><div class="l fj"><img alt="Quinn Lanners" class="l fd by tc td cx" src="https://miro.medium.com/v2/resize:fill:128:128/2*Brk5nEh8iz86Uf-730hMgA.png" width="64" height="64" loading="lazy"/><div class="fr by l tc td fs n ay tb"></div></div></a></div><div class="j i d te jj"><div class="ab"><span><a class="bf b bg z tf ro tg th ti tj tk ev ew tl tm tn fa fb fc fd bm fe ff" 
href="https://medium.com/m/signin?actionUrl=https%3A%2F%2Fmedium.com%2F_%2Fsubscribe%2Fuser%2F8a2f7df48b90&operation=register&redirect=https%3A%2F%2Ftowardsdatascience.com%2Fneural-machine-translation-15ecf6b0b&user=Quinn+Lanners&userId=8a2f7df48b90&source=post_page-8a2f7df48b90--post_author_info--15ecf6b0b---------------------follow_profile-----------" rel="noopener follow">Follow</a></span></div></div></div><div class="ab co to"><div class="tp tq tr ts tt l"><a class="af ag ah aj ak al am an ao ap aq ar as at ab q" href="https://medium.com/@lannersq?source=post_page---post_author_info--15ecf6b0b--------------------------------" rel="noopener follow"><h2 class="pw-author-name bf tv tw tx ty tz ua ub ok uc ud oo ue uf os ug uh bk"><span class="gn tu">Written by <!-- -->Quinn Lanners</span></h2></a><div class="rm ab im"><div class="l jj"><span class="pw-follower-count bf b bg z du"><a class="af ag ah ai aj ak al am an ao ap aq ar jc" href="https://medium.com/@lannersq/followers?source=post_page---post_author_info--15ecf6b0b--------------------------------" rel="noopener follow">96 Followers</a></span></div><div class="bf b bg z du ab jp"><span class="jd l" aria-hidden="true"><span class="bf b bg z du">·</span></span><a class="af ag ah ai aj ak al am an ao ap aq ar jc" href="https://medium.com/@lannersq/following?source=post_page---post_author_info--15ecf6b0b--------------------------------" rel="noopener follow">1 Following</a></div></div><div class="ui l"><p class="bf b bg z bk"><span class="gn">Biostatistics PhD student at Duke University. 
My work centers around interpretable AI and Causal Inference with observational data, particularly in medicine.</span></p></div></div></div><div class="h k"><div class="ab"><span><a class="bf b bg z tf ro tg th ti tj tk ev ew tl tm tn fa fb fc fd bm fe ff" href="https://medium.com/m/signin?actionUrl=https%3A%2F%2Fmedium.com%2F_%2Fsubscribe%2Fuser%2F8a2f7df48b90&operation=register&redirect=https%3A%2F%2Ftowardsdatascience.com%2Fneural-machine-translation-15ecf6b0b&user=Quinn+Lanners&userId=8a2f7df48b90&source=post_page-8a2f7df48b90--post_author_info--15ecf6b0b---------------------follow_profile-----------" rel="noopener follow">Follow</a></span></div></div></div></div></div></div><div class="uj uk ul um un l"><div class="sk bh r uj uk uo up uq"></div><div class="ab cb"><div class="ci bh fz ga gb gc"><div class="ab q cp"><h2 class="bf tv pk hu pm pn hx pp pq ps pt pu pw px py qa qb bk">Responses (<!-- -->5<!-- -->)</h2><div class="ab ur"><div><div class="bm" aria-hidden="false"><a class="us ut" href="https://policy.medium.com/medium-rules-30e5502c4eb4?source=post_page---post_responses--15ecf6b0b--------------------------------" rel="noopener follow" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" width="25" height="25" viewBox="0 0 25 25"><path fill-rule="evenodd" d="M11.987 5.036a.754.754 0 0 1 .914-.01c.972.721 1.767 1.218 2.6 1.543.828.322 1.719.485 2.887.505a.755.755 0 0 1 .741.757c-.018 3.623-.43 6.256-1.449 8.21-1.034 1.984-2.662 3.209-4.966 4.083a.75.75 0 0 1-.537-.003c-2.243-.874-3.858-2.095-4.897-4.074-1.024-1.951-1.457-4.583-1.476-8.216a.755.755 0 0 1 .741-.757c1.195-.02 2.1-.182 2.923-.503.827-.322 1.6-.815 2.519-1.535m.468.903c-.897.69-1.717 1.21-2.623 1.564-.898.35-1.856.527-3.026.565.037 3.45.469 5.817 1.36 7.515.884 1.684 2.25 2.762 4.284 3.571 2.092-.81 3.465-1.89 4.344-3.575.886-1.698 1.299-4.065 1.334-7.512-1.149-.039-2.091-.217-2.99-.567-.906-.353-1.745-.873-2.683-1.561m-.009 9.155a2.672 2.672 0 1 0 0-5.344 2.672 2.672 0 0 0 0 5.344m0 
1a3.672 3.672 0 1 0 0-7.344 3.672 3.672 0 0 0 0 7.344m-1.813-3.777.525-.526.916.917 1.623-1.625.526.526-2.149 2.152z" clip-rule="evenodd"></path></svg></a></div></div></div></div><div class="oz l"><button class="bf b bg z bk ro uu uv uw mj mg tk ev ew ex ux uy uz fa va vb vc vd ve fb fc fd bm fe ff">See all responses</button></div></div></div></div><div class="vf vg vh vi vj l bx"><div class="h k j"><div class="sk bh vk vl"></div><div class="ab cb"><div class="ci bh fz ga gb gc"><div class="vm ab lh jm"><div class="vn vo l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://help.medium.com/hc/en-us?source=post_page-----15ecf6b0b--------------------------------" rel="noopener follow"><p class="bf b dv z du">Help</p></a></div><div class="vn vo l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://medium.statuspage.io/?source=post_page-----15ecf6b0b--------------------------------" rel="noopener follow"><p class="bf b dv z du">Status</p></a></div><div class="vn vo l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://medium.com/about?autoplay=1&source=post_page-----15ecf6b0b--------------------------------" rel="noopener follow"><p class="bf b dv z du">About</p></a></div><div class="vn vo l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://medium.com/jobs-at-medium/work-at-medium-959d1a85284e?source=post_page-----15ecf6b0b--------------------------------" rel="noopener follow"><p class="bf b dv z du">Careers</p></a></div><div class="vn vo l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="pressinquiries@medium.com?source=post_page-----15ecf6b0b--------------------------------" rel="noopener follow"><p class="bf b dv z du">Press</p></a></div><div class="vn vo l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://blog.medium.com/?source=post_page-----15ecf6b0b--------------------------------" rel="noopener follow"><p class="bf b dv z du">Blog</p></a></div><div 
class="vn vo l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://policy.medium.com/medium-privacy-policy-f03bf92035c9?source=post_page-----15ecf6b0b--------------------------------" rel="noopener follow"><p class="bf b dv z du">Privacy</p></a></div><div class="vn vo l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://policy.medium.com/medium-terms-of-service-9db0094a1e0f?source=post_page-----15ecf6b0b--------------------------------" rel="noopener follow"><p class="bf b dv z du">Terms</p></a></div><div class="vn vo l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://speechify.com/medium?source=post_page-----15ecf6b0b--------------------------------" rel="noopener follow"><p class="bf b dv z du">Text to speech</p></a></div><div class="vn l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://medium.com/business?source=post_page-----15ecf6b0b--------------------------------" rel="noopener follow"><p class="bf b dv z du">Teams</p></a></div></div></div></div></div></div></div></div></div></div><script>window.__BUILD_ID__="main-20241126-181518-0cb59a020f"</script><script>window.__GRAPHQL_URI__ = "https://towardsdatascience.com/_/graphql"</script><script>window.__PRELOADED_STATE__ = {"algolia":{"queries":{}},"cache":{"experimentGroupSet":true,"reason":"This request is not using the cache middleware 
worker","group":"disabled","tags":["group-edgeCachePosts","post-15ecf6b0b","user-8a2f7df48b90","collection-7f60cf5620c9"],"serverVariantState":"","middlewareEnabled":false,"cacheStatus":"DYNAMIC","shouldUseCache":false,"vary":[],"lohpSummerUpsellEnabled":false,"publicationHierarchyEnabledWeb":false,"postBottomResponsesEnabled":false},"client":{"hydrated":false,"isUs":false,"isNativeMedium":false,"isSafariMobile":false,"isSafari":false,"isFirefox":false,"routingEntity":{"type":"COLLECTION","id":"7f60cf5620c9","explicit":true},"viewerIsBot":false},"debug":{"requestId":"eb46ad85-baee-4a20-8bc0-48df31d56ea0","hybridDevServices":[],"originalSpanCarrier":{"traceparent":"00-6cc64e6ef82ec2e8f53e4b276abc7a7c-f2a935f4ab445bb6-01"}},"multiVote":{"clapsPerPost":{}},"navigation":{"branch":{"show":null,"hasRendered":null,"blockedByCTA":false},"hideGoogleOneTap":false,"hasRenderedAlternateUserBanner":null,"currentLocation":"https:\u002F\u002Ftowardsdatascience.com\u002Fneural-machine-translation-15ecf6b0b","host":"towardsdatascience.com","hostname":"towardsdatascience.com","referrer":"","hasSetReferrer":false,"susiModal":{"step":null,"operation":"register"},"postRead":false,"partnerProgram":{"selectedCountryCode":null},"queryString":"?source=rss----7f60cf5620c9---4"},"config":{"nodeEnv":"production","version":"main-20241126-181518-0cb59a020f","target":"production","productName":"Medium","publicUrl":"https:\u002F\u002Fcdn-client.medium.com\u002Flite","authDomain":"medium.com","authGoogleClientId":"216296035834-k1k6qe060s2tp2a2jam4ljdcms00sttg.apps.googleusercontent.com","favicon":"production","glyphUrl":"https:\u002F\u002Fglyph.medium.com","branchKey":"key_live_ofxXr2qTrrU9NqURK8ZwEhknBxiI6KBm","algolia":{"appId":"MQ57UUUQZ2","apiKeySearch":"394474ced050e3911ae2249ecc774921","indexPrefix":"medium_","host":"-dsn.algolia.net"},"recaptchaKey":"6Lfc37IUAAAAAKGGtC6rLS13R1Hrw_BqADfS1LRk","recaptcha3Key":"6Lf8R9wUAAAAABMI_85Wb8melS7Zj6ziuf99Yot5","recaptchaEnterpriseKeyId":"6Le-uGgpAAAAAP
prRaokM8AKthQ9KNGdoxaGUvVp","datadog":{"applicationId":"6702d87d-a7e0-42fe-bbcb-95b469547ea0","clientToken":"pub853ea8d17ad6821d9f8f11861d23dfed","rumToken":"pubf9cc52896502b9413b68ba36fc0c7162","context":{"deployment":{"target":"production","tag":"main-20241126-181518-0cb59a020f","commit":"0cb59a020f4453d0900f671f1a6576feecc55e74"}},"datacenter":"us"},"googleAnalyticsCode":"G-7JY7T788PK","googlePay":{"apiVersion":"2","apiVersionMinor":"0","merchantId":"BCR2DN6TV7EMTGBM","merchantName":"Medium","instanceMerchantId":"13685562959212738550"},"applePay":{"version":3},"signInWallCustomDomainCollectionIds":["3a8144eabfe3","336d898217ee","61061eb0c96b","138adf9c44c","819cc2aaeee0"],"mediumMastodonDomainName":"me.dm","mediumOwnedAndOperatedCollectionIds":["8a9336e5bb4","b7e45b22fec3","193b68bd4fba","8d6b8a439e32","54c98c43354d","3f6ecf56618","d944778ce714","92d2092dc598","ae2a65f35510","1285ba81cada","544c7006046e","fc8964313712","40187e704f1c","88d9857e584e","7b6769f2748b","bcc38c8f6edf","cef6983b292","cb8577c9149e","444d13b52878","713d7dbc99b0","ef8e90590e66","191186aaafa0","55760f21cdc5","9dc80918cc93","bdc4052bbdba","8ccfed20cbb2"],"tierOneDomains":["medium.com","thebolditalic.com","arcdigital.media","towardsdatascience.com","uxdesign.cc","codeburst.io","psiloveyou.xyz","writingcooperative.com","entrepreneurshandbook.co","prototypr.io","betterhumans.coach.me","theascent.pub"],"topicsToFollow":["d61cf867d93f","8a146bc21b28","1eca0103fff3","4d562ee63426","aef1078a3ef5","e15e46793f8d","6158eb913466","55f1c20aba7a","3d18b94f6858","4861fee224fd","63c6f1f93ee","1d98b3a9a871","decb52b64abf","ae5d4995e225","830cded25262"],"topicToTagMappings":{"accessibility":"accessibility","addiction":"addiction","android-development":"android-development","art":"art","artificial-intelligence":"artificial-intelligence","astrology":"astrology","basic-income":"basic-income","beauty":"beauty","biotech":"biotech","blockchain":"blockchain","books":"books","business":"business","cannabis":"cannabis
","cities":"cities","climate-change":"climate-change","comics":"comics","coronavirus":"coronavirus","creativity":"creativity","cryptocurrency":"cryptocurrency","culture":"culture","cybersecurity":"cybersecurity","data-science":"data-science","design":"design","digital-life":"digital-life","disability":"disability","economy":"economy","education":"education","equality":"equality","family":"family","feminism":"feminism","fiction":"fiction","film":"film","fitness":"fitness","food":"food","freelancing":"freelancing","future":"future","gadgets":"gadgets","gaming":"gaming","gun-control":"gun-control","health":"health","history":"history","humor":"humor","immigration":"immigration","ios-development":"ios-development","javascript":"javascript","justice":"justice","language":"language","leadership":"leadership","lgbtqia":"lgbtqia","lifestyle":"lifestyle","machine-learning":"machine-learning","makers":"makers","marketing":"marketing","math":"math","media":"media","mental-health":"mental-health","mindfulness":"mindfulness","money":"money","music":"music","neuroscience":"neuroscience","nonfiction":"nonfiction","outdoors":"outdoors","parenting":"parenting","pets":"pets","philosophy":"philosophy","photography":"photography","podcasts":"podcast","poetry":"poetry","politics":"politics","privacy":"privacy","product-management":"product-management","productivity":"productivity","programming":"programming","psychedelics":"psychedelics","psychology":"psychology","race":"race","relationships":"relationships","religion":"religion","remote-work":"remote-work","san-francisco":"san-francisco","science":"science","self":"self","self-driving-cars":"self-driving-cars","sexuality":"sexuality","social-media":"social-media","society":"society","software-engineering":"software-engineering","space":"space","spirituality":"spirituality","sports":"sports","startups":"startup","style":"style","technology":"technology","transportation":"transportation","travel":"travel","true-crime":"true-crime","tv":"
tv","ux":"ux","venture-capital":"venture-capital","visual-design":"visual-design","work":"work","world":"world","writing":"writing"},"defaultImages":{"avatar":{"imageId":"1*dmbNkD5D-u45r44go_cf0g.png","height":150,"width":150},"orgLogo":{"imageId":"7*V1_7XP4snlmqrc_0Njontw.png","height":110,"width":500},"postLogo":{"imageId":"bd978bb536350a710e8efb012513429cabdc4c28700604261aeda246d0f980b7","height":810,"width":1440},"postPreviewImage":{"imageId":"1*hn4v1tCaJy7cWMyb0bpNpQ.png","height":386,"width":579}},"collectionStructuredData":{"8d6b8a439e32":{"name":"Elemental","data":{"@type":"NewsMediaOrganization","ethicsPolicy":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Farticles\u002F360043290473","logo":{"@type":"ImageObject","url":"https:\u002F\u002Fcdn-images-1.medium.com\u002Fmax\u002F980\u002F1*9ygdqoKprhwuTVKUM0DLPA@2x.png","width":980,"height":159}}},"3f6ecf56618":{"name":"Forge","data":{"@type":"NewsMediaOrganization","ethicsPolicy":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Farticles\u002F360043290473","logo":{"@type":"ImageObject","url":"https:\u002F\u002Fcdn-images-1.medium.com\u002Fmax\u002F596\u002F1*uULpIlImcO5TDuBZ6lm7Lg@2x.png","width":596,"height":183}}},"ae2a65f35510":{"name":"GEN","data":{"@type":"NewsMediaOrganization","ethicsPolicy":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Farticles\u002F360043290473","logo":{"@type":"ImageObject","url":"https:\u002F\u002Fmiro.medium.com\u002Fmax\u002F264\u002F1*RdVZMdvfV3YiZTw6mX7yWA.png","width":264,"height":140}}},"88d9857e584e":{"name":"LEVEL","data":{"@type":"NewsMediaOrganization","ethicsPolicy":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Farticles\u002F360043290473","logo":{"@type":"ImageObject","url":"https:\u002F\u002Fmiro.medium.com\u002Fmax\u002F540\u002F1*JqYMhNX6KNNb2UlqGqO2WQ.png","width":540,"height":108}}},"7b6769f2748b":{"name":"Marker","data":{"@type":"NewsMediaOrganization","ethicsPolicy":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us
\u002Farticles\u002F360043290473","logo":{"@type":"ImageObject","url":"https:\u002F\u002Fcdn-images-1.medium.com\u002Fmax\u002F383\u002F1*haCUs0wF6TgOOvfoY-jEoQ@2x.png","width":383,"height":92}}},"444d13b52878":{"name":"OneZero","data":{"@type":"NewsMediaOrganization","ethicsPolicy":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Farticles\u002F360043290473","logo":{"@type":"ImageObject","url":"https:\u002F\u002Fmiro.medium.com\u002Fmax\u002F540\u002F1*cw32fIqCbRWzwJaoQw6BUg.png","width":540,"height":123}}},"8ccfed20cbb2":{"name":"Zora","data":{"@type":"NewsMediaOrganization","ethicsPolicy":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Farticles\u002F360043290473","logo":{"@type":"ImageObject","url":"https:\u002F\u002Fmiro.medium.com\u002Fmax\u002F540\u002F1*tZUQqRcCCZDXjjiZ4bDvgQ.png","width":540,"height":106}}}},"embeddedPostIds":{"coronavirus":"cd3010f9d81f"},"sharedCdcMessaging":{"COVID_APPLICABLE_TAG_SLUGS":[],"COVID_APPLICABLE_TOPIC_NAMES":[],"COVID_APPLICABLE_TOPIC_NAMES_FOR_TOPIC_PAGE":[],"COVID_MESSAGES":{"tierA":{"text":"For more information on the novel coronavirus and Covid-19, visit cdc.gov.","markups":[{"start":66,"end":73,"href":"https:\u002F\u002Fwww.cdc.gov\u002Fcoronavirus\u002F2019-nCoV"}]},"tierB":{"text":"Anyone can publish on Medium per our Policies, but we don’t fact-check every story. For more info about the coronavirus, see cdc.gov.","markups":[{"start":37,"end":45,"href":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Fcategories\u002F201931128-Policies-Safety"},{"start":125,"end":132,"href":"https:\u002F\u002Fwww.cdc.gov\u002Fcoronavirus\u002F2019-nCoV"}]},"paywall":{"text":"This article has been made free for everyone, thanks to Medium Members. 
For more information on the novel coronavirus and Covid-19, visit cdc.gov.","markups":[{"start":56,"end":70,"href":"https:\u002F\u002Fmedium.com\u002Fmembership"},{"start":138,"end":145,"href":"https:\u002F\u002Fwww.cdc.gov\u002Fcoronavirus\u002F2019-nCoV"}]},"unbound":{"text":"This article is free for everyone, thanks to Medium Members. For more information on the novel coronavirus and Covid-19, visit cdc.gov.","markups":[{"start":45,"end":59,"href":"https:\u002F\u002Fmedium.com\u002Fmembership"},{"start":127,"end":134,"href":"https:\u002F\u002Fwww.cdc.gov\u002Fcoronavirus\u002F2019-nCoV"}]}},"COVID_BANNER_POST_ID_OVERRIDE_WHITELIST":["3b31a67bff4a"]},"sharedVoteMessaging":{"TAGS":["politics","election-2020","government","us-politics","election","2020-presidential-race","trump","donald-trump","democrats","republicans","congress","republican-party","democratic-party","biden","joe-biden","maga"],"TOPICS":["politics","election"],"MESSAGE":{"text":"Find out more about the U.S. election results 
here.","markups":[{"start":46,"end":50,"href":"https:\u002F\u002Fcookpolitical.com\u002F2020-national-popular-vote-tracker"}]},"EXCLUDE_POSTS":["397ef29e3ca5"]},"embedPostRules":[],"recircOptions":{"v1":{"limit":3},"v2":{"limit":8}},"braintreeClientKey":"production_zjkj96jm_m56f8fqpf7ngnrd4","braintree":{"enabled":true,"merchantId":"m56f8fqpf7ngnrd4","merchantAccountId":{"usd":"AMediumCorporation_instant","eur":"amediumcorporation_EUR","cad":"amediumcorporation_CAD"},"publicKey":"ds2nn34bg2z7j5gd","braintreeEnvironment":"production","dashboardUrl":"https:\u002F\u002Fwww.braintreegateway.com\u002Fmerchants","gracePeriodDurationInDays":14,"mediumMembershipPlanId":{"monthly":"ce105f8c57a3","monthlyV2":"e8a5e126-792b-4ee6-8fba-d574c1b02fc5","monthlyWithTrial":"d5ee3dbe3db8","monthlyPremium":"fa741a9b47a2","yearly":"a40ad4a43185","yearlyV2":"3815d7d6-b8ca-4224-9b8c-182f9047866e","yearlyStaff":"d74fb811198a","yearlyWithTrial":"b3bc7350e5c7","yearlyPremium":"e21bd2c12166","monthlyOneYearFree":"e6c0637a-2bad-4171-ab4f-3c268633d83c","monthly25PercentOffFirstYear":"235ecc62-0cdb-49ae-9378-726cd21c504b","monthly20PercentOffFirstYear":"ba518864-9c13-4a99-91ca-411bf0cac756","monthly15PercentOffFirstYear":"594c029b-9f89-43d5-88f8-8173af4e070e","monthly10PercentOffFirstYear":"c6c7bc9a-40f2-4b51-8126-e28511d5bdb0","monthlyForStudents":"629ebe51-da7d-41fd-8293-34cd2f2030a8","yearlyOneYearFree":"78ba7be9-0d9f-4ece-aa3e-b54b826f2bf1","yearly25PercentOffFirstYear":"2dbb010d-bb8f-4eeb-ad5c-a08509f42d34","yearly20PercentOffFirstYear":"47565488-435b-47f8-bf93-40d5fbe0ebc8","yearly15PercentOffFirstYear":"8259809b-0881-47d9-acf7-6c001c7f720f","yearly10PercentOffFirstYear":"9dd694fb-96e1-472c-8d9e-3c868d5c1506","yearlyForStudents":"e29345ef-ab1c-4234-95c5-70e50fe6bc23","monthlyCad":"p52orjkaceei","yearlyCad":"h4q9g2up9ktt"},"braintreeDiscountId":{"oneMonthFree":"MONTHS_FREE_01","threeMonthsFree":"MONTHS_FREE_03","sixMonthsFree":"MONTHS_FREE_06","fiftyPercentOffOneYear":"FIFTY_PERCENT_OFF_ONE
_YEAR"},"3DSecureVersion":"2","defaultCurrency":"usd","providerPlanIdCurrency":{"4ycw":"usd","rz3b":"usd","3kqm":"usd","jzw6":"usd","c2q2":"usd","nnsw":"usd","q8qw":"usd","d9y6":"usd","fx7w":"cad","nwf2":"cad"}},"paypalClientId":"AXj1G4fotC2GE8KzWX9mSxCH1wmPE3nJglf4Z2ig_amnhvlMVX87otaq58niAg9iuLktVNF_1WCMnN7v","paypal":{"host":"https:\u002F\u002Fapi.paypal.com:443","clientMode":"production","serverMode":"live","webhookId":"4G466076A0294510S","monthlyPlan":{"planId":"P-9WR0658853113943TMU5FDQA","name":"Medium Membership (Monthly) with setup fee","description":"Unlimited access to the best and brightest stories on Medium. Membership billed monthly."},"yearlyPlan":{"planId":"P-7N8963881P8875835MU5JOPQ","name":"Medium Membership (Annual) with setup fee","description":"Unlimited access to the best and brightest stories on Medium. Membership billed annually."},"oneYearGift":{"name":"Medium Membership (1 Year, Digital Gift Code)","description":"Unlimited access to the best and brightest stories on Medium. Gift codes can be redeemed at medium.com\u002Fredeem.","price":"50.00","currency":"USD","sku":"membership-gift-1-yr"},"oldMonthlyPlan":{"planId":"P-96U02458LM656772MJZUVH2Y","name":"Medium Membership (Monthly)","description":"Unlimited access to the best and brightest stories on Medium. Membership billed monthly."},"oldYearlyPlan":{"planId":"P-59P80963JF186412JJZU3SMI","name":"Medium Membership (Annual)","description":"Unlimited access to the best and brightest stories on Medium. Membership billed annually."},"monthlyPlanWithTrial":{"planId":"P-66C21969LR178604GJPVKUKY","name":"Medium Membership (Monthly) with setup fee","description":"Unlimited access to the best and brightest stories on Medium. Membership billed monthly."},"yearlyPlanWithTrial":{"planId":"P-6XW32684EX226940VKCT2MFA","name":"Medium Membership (Annual) with setup fee","description":"Unlimited access to the best and brightest stories on Medium. 
Membership billed annually."},"oldMonthlyPlanNoSetupFee":{"planId":"P-4N046520HR188054PCJC7LJI","name":"Medium Membership (Monthly)","description":"Unlimited access to the best and brightest stories on Medium. Membership billed monthly."},"oldYearlyPlanNoSetupFee":{"planId":"P-7A4913502Y5181304CJEJMXQ","name":"Medium Membership (Annual)","description":"Unlimited access to the best and brightest stories on Medium. Membership billed annually."},"sdkUrl":"https:\u002F\u002Fwww.paypal.com\u002Fsdk\u002Fjs"},"stripePublishableKey":"pk_live_7FReX44VnNIInZwrIIx6ghjl","log":{"json":true,"level":"info"},"imageUploadMaxSizeMb":25,"staffPicks":{"title":"Staff Picks","catalogId":"c7bc6e1ee00f"}},"session":{"xsrf":""}}</script><script>window.__APOLLO_STATE__ = {"ROOT_QUERY":{"__typename":"Query","viewer":null,"collectionByDomainOrSlug({\"domainOrSlug\":\"towardsdatascience.com\"})":{"__ref":"Collection:7f60cf5620c9"},"variantFlags":[{"__typename":"VariantFlag","name":"enable_lo_homepage","valueType":{"__typename":"VariantFlagString","value":"control"}},{"__typename":"VariantFlag","name":"enable_starspace","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_rex_new_push_notification_endpoint","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_sharer_validate_post_share_key","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_android_offline_reading","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_android_miro_v2","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_automod","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_verifications_service","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"allow_test_auth","va
lueType":{"__typename":"VariantFlagString","value":"disallow"}},{"__typename":"VariantFlag","name":"skip_fs_cache_user_vals","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_medium2_kbfd","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"mobile_custom_app_icon","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_lite_response_markup","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_pill_based_home_feed","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_rex_aggregator_v2","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_billing_frequency_on_step2","valueType":{"__typename":"VariantFlagString","value":"group_1"}},{"__typename":"VariantFlag","name":"android_enable_friend_links_creation","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_boost_nia_v01","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_diversification_rex","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_pp_v4","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"allow_signup","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_android_dynamic_programming_paywall","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_deprecate_legacy_providers_v3","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_premium_tier","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"limit_user_follows
","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_lite_continue_this_thread","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"glyph_font_set","valueType":{"__typename":"VariantFlagString","value":"m2-unbound-source-serif-pro"}},{"__typename":"VariantFlag","name":"enable_marketing_emails","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"onboarding_tags_from_top_views","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_ios_autorefresh","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_sprig","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_braintree_integration","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_ml_rank_rex_anno","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_speechify_widget","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"android_rating_prompt_stories_read_threshold","valueType":{"__typename":"VariantFlagNumber","value":2}},{"__typename":"VariantFlag","name":"enable_footer_app_buttons","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_tribute_landing_page","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_moc_load_processor_c","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_recaptcha_enterprise","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"num_post_bottom_responses_to_show","valueType":{"__typename":"VariantFlagString","value":"3"}},{"__typename":"VariantFlag",
"name":"disable_partner_program_enrollment","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_tipping_v0_ios","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_recirc_model","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_simplified_digest_v2_b","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_susi_redesign_android","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_update_topic_portals_wtf","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"ios_enable_home_post_menu","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"price_smoke_test_monthly","valueType":{"__typename":"VariantFlagString","value":""}},{"__typename":"VariantFlag","name":"enable_legacy_feed_in_iceland","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_app_flirty_thirty","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_members_only_audio","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"android_enable_syntax_highlight","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"reader_fair_distribution_non_qp","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"android_enable_lists_v2","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_cache_less_following_feed","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_group_gifting","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__t
ypename":"VariantFlag","name":"available_monthly_plan","valueType":{"__typename":"VariantFlagString","value":"60e220181034"}},{"__typename":"VariantFlag","name":"enable_moc_load_processor_first_story","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_sharer_create_post_share_key","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_tick_landing_page","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"available_annual_plan","valueType":{"__typename":"VariantFlagString","value":"2c754bcc2995"}},{"__typename":"VariantFlag","name":"enable_ios_offline_reading","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_tag_recs","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_aurora_pub_follower_page","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_creator_welcome_email","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_explicit_signals_updated_post_previews","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_new_manage_membership_flow","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"available_annual_premium_plan","valueType":{"__typename":"VariantFlagString","value":"4a442ace1476"}},{"__typename":"VariantFlag","name":"enable_entities_to_follow_v2","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_iceland_forced_android","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_ios_dynamic_paywall_aspiriational","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":
"enable_post_bottom_responses_input","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_switch_plan_premium_tier","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"ios_enable_friend_links_postpage_banners","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"price_smoke_test_yearly","valueType":{"__typename":"VariantFlagString","value":""}},{"__typename":"VariantFlag","name":"android_enable_friend_links_postpage_banners","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_android_verified_author","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_see_pronouns","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"android_enable_editor_new_publishing_flow","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_author_cards","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_explicit_signals","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_new_stripe_customers","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"ios_social_share_sheet","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_auto_follow_on_subscribe","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_ios_easy_resubscribe","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"available_monthly_premium_plan","valueType":{"__typename":"VariantFlagString","value":"12a660186432"}},{"__typename":"VariantFlag","name":"ios_display_paywall_after_onboarding","valueType":{"
__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_conversion_model_v2","valueType":{"__typename":"VariantFlagString","value":"group_2"}},{"__typename":"VariantFlag","name":"enable_google_webhook","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_lite_server_upstream_deadlines","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_eventstats_event_processing","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_pp_country_expansion","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"ios_iceland_nux","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_braintree_google_pay","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_moc_load_processor_all_recs_surfaces","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"ios_enable_lock_responses","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_pre_pp_v4","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"signin_services","valueType":{"__typename":"VariantFlagString","value":"twitter,facebook,google,email,google-fastidv,google-one-tap,apple"}},{"__typename":"VariantFlag","name":"enable_apple_sign_in","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_update_explore_wtf","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"reengagement_notification_duration","valueType":{"__typename":"VariantFlagNumber","value":3}},{"__typename":"VariantFlag","name":"ios_enable_friend_links_creation","valueType":{"__typename":"VariantFlagBoolean","value":
true}},{"__typename":"VariantFlag","name":"ios_remove_twitter_onboarding_step","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_maim_the_meter","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_rito_upstream_deadlines","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"textshots_userid","valueType":{"__typename":"VariantFlagString","value":""}},{"__typename":"VariantFlag","name":"enable_braintree_webhook","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_publication_hierarchy_web","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_seamless_social_sharing","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_susi_redesign_ios","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"android_two_hour_refresh","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_ios_dynamic_paywall_programming","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"limit_post_referrers","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_newsletter_lo_flow_custom_domains","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"coronavirus_topic_recirc","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_import","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_recommended_publishers_query","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_speechify_ios","valueType":{"__typename":"Varian
tFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"android_enable_image_sharer","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_author_cards_byline","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_mastodon_avatar_upload","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"redefined_top_posts","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_android_dynamic_aspirational_paywall","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"can_receive_tips_v0","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_branch_io","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_tipping_v0_android","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_updated_pub_recs_ui","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"browsable_stream_config_bucket","valueType":{"__typename":"VariantFlagString","value":"curated-topics"}},{"__typename":"VariantFlag","name":"ios_enable_verified_book_author","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_lite_archive_page","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_ranker_v10","valueType":{"__typename":"VariantFlagString","value":"control"}},{"__typename":"VariantFlag","name":"enable_apple_webhook","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_mastodon_for_members","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"signup_services","valueType":{"__typename
":"VariantFlagString","value":"twitter,facebook,google,email,google-fastidv,google-one-tap,apple"}},{"__typename":"VariantFlag","name":"enable_google_one_tap","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_intrinsic_automatic_actions","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"goliath_externalsearch_enable_comment_deindexation","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_bg_post_post","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_braintree_apple_pay","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_braintree_client","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_braintree_paypal","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_configure_pronouns","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"android_enable_topic_portals","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"rex_generator_max_candidates","valueType":{"__typename":"VariantFlagNumber","value":1000}},{"__typename":"VariantFlag","name":"enable_rex_reading_history","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_mastodon_for_members_username_selection","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_premium_tier_badge","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"ios_in_app_free_trial","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_lite_homepage","valueType":{"__typename":"VariantFlagBoolean","
value":true}},{"__typename":"VariantFlag","name":"allow_access","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"can_send_tips_v0","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_abandoned_cart_promotion_email","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_bayesian_average_pub_search","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_braintree_trial_membership","valueType":{"__typename":"VariantFlagBoolean","value":true}},{"__typename":"VariantFlag","name":"enable_conversion_ranker_v2","valueType":{"__typename":"VariantFlagString","value":"control"}},{"__typename":"VariantFlag","name":"enable_post_bottom_responses","valueType":{"__typename":"VariantFlagBoolean","value":true}}],"postResult({\"id\":\"15ecf6b0b\"})":{"__ref":"Post:15ecf6b0b"}},"ImageMetadata:1*VzTUkfeGymHP4Bvav-T-lA.png":{"__typename":"ImageMetadata","id":"1*VzTUkfeGymHP4Bvav-T-lA.png"},"Collection:7f60cf5620c9":{"__typename":"Collection","id":"7f60cf5620c9","favicon":{"__ref":"ImageMetadata:1*VzTUkfeGymHP4Bvav-T-lA.png"},"customStyleSheet":null,"colorPalette":{"__typename":"ColorPalette","highlightSpectrum":{"__typename":"ColorSpectrum","backgroundColor":"#FFFFFFFF","colorPoints":[{"__typename":"ColorPoint","color":"#FFEDF4FC","point":0},{"__typename":"ColorPoint","color":"#FFE9F2FD","point":0.1},{"__typename":"ColorPoint","color":"#FFE6F1FD","point":0.2},{"__typename":"ColorPoint","color":"#FFE2EFFD","point":0.3},{"__typename":"ColorPoint","color":"#FFDFEEFD","point":0.4},{"__typename":"ColorPoint","color":"#FFDBECFE","point":0.5},{"__typename":"ColorPoint","color":"#FFD7EBFE","point":0.6},{"__typename":"ColorPoint","color":"#FFD4E9FE","point":0.7},{"__typename":"ColorPoint","color":"#FFD0E7FF","point":0.8},{"__typename":"ColorPoint","color":"#FFCCE6FF","point":0.9},{"__typename":"Col
orPoint","color":"#FFC8E4FF","point":1}]},"defaultBackgroundSpectrum":{"__typename":"ColorSpectrum","backgroundColor":"#FFFFFFFF","colorPoints":[{"__typename":"ColorPoint","color":"#FF668AAA","point":0},{"__typename":"ColorPoint","color":"#FF61809D","point":0.1},{"__typename":"ColorPoint","color":"#FF5A7690","point":0.2},{"__typename":"ColorPoint","color":"#FF546C83","point":0.3},{"__typename":"ColorPoint","color":"#FF4D6275","point":0.4},{"__typename":"ColorPoint","color":"#FF455768","point":0.5},{"__typename":"ColorPoint","color":"#FF3D4C5A","point":0.6},{"__typename":"ColorPoint","color":"#FF34414C","point":0.7},{"__typename":"ColorPoint","color":"#FF2B353E","point":0.8},{"__typename":"ColorPoint","color":"#FF21282F","point":0.9},{"__typename":"ColorPoint","color":"#FF161B1F","point":1}]},"tintBackgroundSpectrum":{"__typename":"ColorSpectrum","backgroundColor":"#FF355876","colorPoints":[{"__typename":"ColorPoint","color":"#FF355876","point":0},{"__typename":"ColorPoint","color":"#FF4D6C88","point":0.1},{"__typename":"ColorPoint","color":"#FF637F99","point":0.2},{"__typename":"ColorPoint","color":"#FF7791A8","point":0.3},{"__typename":"ColorPoint","color":"#FF8CA2B7","point":0.4},{"__typename":"ColorPoint","color":"#FF9FB3C6","point":0.5},{"__typename":"ColorPoint","color":"#FFB2C3D4","point":0.6},{"__typename":"ColorPoint","color":"#FFC5D2E1","point":0.7},{"__typename":"ColorPoint","color":"#FFD7E2EE","point":0.8},{"__typename":"ColorPoint","color":"#FFE9F1FA","point":0.9},{"__typename":"ColorPoint","color":"#FFFBFFFF","point":1}]}},"domain":"towardsdatascience.com","slug":"towards-data-science","googleAnalyticsId":null,"editors":[{"__typename":"CollectionMastheadUserItem","user":{"__ref":"User:e6ad8abedec9"}},{"__typename":"CollectionMastheadUserItem","user":{"__ref":"User:895063a310f4"}},{"__typename":"CollectionMastheadUserItem","user":{"__ref":"User:7e12c71dfa81"}}],"name":"Towards Data 
Science","avatar":{"__ref":"ImageMetadata:1*CJe3891yB1A1mzMdqemkdg.jpeg"},"description":"Your home for data science and AI. The world’s leading publication for data science, data analytics, data engineering, machine learning, and artificial intelligence professionals.","subscriberCount":768655,"latestPostsConnection({\"paging\":{\"limit\":1}})":{"__typename":"PostConnection","posts":[{"__ref":"Post:2dc9beb69b27"}]},"viewerEdge":{"__ref":"CollectionViewerEdge:collectionId:7f60cf5620c9-viewerId:lo_176642cbbfab"},"twitterUsername":"TDataScience","facebookPageId":null,"logo":{"__ref":"ImageMetadata:1*cFFKn8rFH4ZndmaYeAs6iQ.png"}},"User:e6ad8abedec9":{"__typename":"User","id":"e6ad8abedec9"},"User:895063a310f4":{"__typename":"User","id":"895063a310f4"},"User:7e12c71dfa81":{"__typename":"User","id":"7e12c71dfa81"},"ImageMetadata:1*CJe3891yB1A1mzMdqemkdg.jpeg":{"__typename":"ImageMetadata","id":"1*CJe3891yB1A1mzMdqemkdg.jpeg"},"User:c8a0ca9d85d8":{"__typename":"User","id":"c8a0ca9d85d8","customDomainState":null,"hasSubdomain":false,"username":"slavahead"},"Post:2dc9beb69b27":{"__typename":"Post","id":"2dc9beb69b27","firstPublishedAt":1732719740045,"creator":{"__ref":"User:c8a0ca9d85d8"},"collection":{"__ref":"Collection:7f60cf5620c9"},"isSeries":false,"mediumUrl":"https:\u002F\u002Ftowardsdatascience.com\u002Froadmap-to-becoming-a-data-scientist-part-1-maths-2dc9beb69b27","sequence":null,"uniqueSlug":"roadmap-to-becoming-a-data-scientist-part-1-maths-2dc9beb69b27"},"LinkedAccounts:8a2f7df48b90":{"__typename":"LinkedAccounts","mastodon":null,"id":"8a2f7df48b90"},"UserViewerEdge:userId:8a2f7df48b90-viewerId:lo_176642cbbfab":{"__typename":"UserViewerEdge","id":"userId:8a2f7df48b90-viewerId:lo_176642cbbfab","isFollowing":false,"isUser":false,"isMuting":false},"NewsletterV3:874b67ced0c1":{"__typename":"NewsletterV3","id":"874b67ced0c1","type":"NEWSLETTER_TYPE_AUTHOR","slug":"8a2f7df48b90","name":"8a2f7df48b90","collection":null,"user":{"__ref":"User:8a2f7df48b90"}},"User:8a2f7d
f48b90":{"__typename":"User","id":"8a2f7df48b90","name":"Quinn Lanners","username":"lannersq","newsletterV3":{"__ref":"NewsletterV3:874b67ced0c1"},"linkedAccounts":{"__ref":"LinkedAccounts:8a2f7df48b90"},"isSuspended":false,"imageId":"2*Brk5nEh8iz86Uf-730hMgA.png","mediumMemberAt":0,"verifications":{"__typename":"VerifiedInfo","isBookAuthor":false},"socialStats":{"__typename":"SocialStats","followerCount":96,"followingCount":1,"collectionFollowingCount":0},"customDomainState":null,"hasSubdomain":false,"bio":"Biostatistics PhD student at Duke University. My work centers around interpretable AI and Causal Inference with observational data, particularly in medicine.","isPartnerProgramEnrolled":false,"viewerEdge":{"__ref":"UserViewerEdge:userId:8a2f7df48b90-viewerId:lo_176642cbbfab"},"viewerIsUser":false,"postSubscribeMembershipUpsellShownAt":0,"membership":null,"allowNotes":true,"twitterScreenName":""},"Topic:1eca0103fff3":{"__typename":"Topic","slug":"machine-learning","id":"1eca0103fff3","name":"Machine Learning"},"Paragraph:44abc7cff577_0":{"__typename":"Paragraph","id":"44abc7cff577_0","name":"4d3b","type":"H3","href":null,"layout":null,"metadata":null,"text":"Neural Machine Translation","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_1":{"__typename":"Paragraph","id":"44abc7cff577_1","name":"c415","type":"H4","href":null,"layout":null,"metadata":null,"text":"A guide to Neural Machine Translation using an Encoder Decoder structure with attention. 
Includes a detailed tutorial using PyTorch in Google Colaboratory.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*H441VINdbjxItCdtgb-1Xw.jpeg":{"__typename":"ImageMetadata","id":"1*H441VINdbjxItCdtgb-1Xw.jpeg","originalHeight":585,"originalWidth":960,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_2":{"__typename":"Paragraph","id":"44abc7cff577_2","name":"3868","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*H441VINdbjxItCdtgb-1Xw.jpeg"},"text":"Image from pixabay.com","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_3":{"__typename":"Paragraph","id":"44abc7cff577_3","name":"87ab","type":"P","href":null,"layout":null,"metadata":null,"text":"Machine Translation (MT) is a subfield of computational linguistics that is focused on translating text from one language to another. With the power of deep learning, Neural Machine Translation (NMT) has arisen as the most powerful algorithm to perform this task. While Google Translate is the leading industry example of NMT, tech companies all over the globe are going all in on NMT. This state-of-the-art algorithm is an application of deep learning in which massive datasets of translated sentences are used to train a model capable of translating between any two languages. With the vast amount of research in recent years, there are several variations of NMT currently being investigated and deployed in the industry. One of the older and more established versions of NMT is the Encoder Decoder structure. This architecture is composed of two recurrent neural networks (RNNs) used together in tandem to create a translation model. 
And when coupled with the power of attention mechanisms, this architecture can achieve impressive results.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":371,"end":384,"href":"https:\u002F\u002Fslator.com\u002Ftechnology\u002Fcorporates-going-all-in-on-neural-machine-translation-research\u002F","anchorType":"LINK","userId":null,"linkMetadata":null},{"__typename":"Markup","type":"A","start":972,"end":992,"href":"https:\u002F\u002Farxiv.org\u002Fpdf\u002F1508.04025.pdf","anchorType":"LINK","userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_4":{"__typename":"Paragraph","id":"44abc7cff577_4","name":"394d","type":"P","href":null,"layout":null,"metadata":null,"text":"This post is broken into two distinct parts. The first section consists of a brief explanation of NMT and the Encoder Decoder structure. Following this, the latter part of this article provides a tutorial which will allow the chance for you to create one of these structures yourself. This code tutorial is based largely on the PyTorch tutorial on NMT with a number of enhancements. 
Most notably, this code tutorial can be run on a GPU to receive significantly better results.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":328,"end":351,"href":"https:\u002F\u002Fpytorch.org\u002Ftutorials\u002Fintermediate\u002Fseq2seq_translation_tutorial.html","anchorType":"LINK","userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_5":{"__typename":"Paragraph","id":"44abc7cff577_5","name":"c768","type":"P","href":null,"layout":null,"metadata":null,"text":"Before we begin, it is assumed that if you are reading this article you have at least a general knowledge of neural networks and deep learning; particularly the ideas of forward-propagation, loss functions and back-propagation, and the importance of train and test sets.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":109,"end":142,"href":"https:\u002F\u002Fskymind.ai\u002Fwiki\u002Fneural-network","anchorType":"LINK","userId":null,"linkMetadata":null},{"__typename":"Markup","type":"A","start":250,"end":269,"href":"https:\u002F\u002Ftowardsdatascience.com\u002Ftrain-validation-and-test-sets-72cb40cba9e7","anchorType":"LINK","userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_6":{"__typename":"Paragraph","id":"44abc7cff577_6","name":"9249","type":"P","href":null,"layout":null,"metadata":null,"text":"If you are interested in jumping straight to the code, you can find the complete Jupyter notebook (or Python script) of the Google Colab tutorial outlined in this article on my GitHub page for this 
project.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":177,"end":205,"href":"https:\u002F\u002Fgithub.com\u002Fqlanners\u002Fnmt_tutorial","anchorType":"LINK","userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":0,"end":206,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_7":{"__typename":"Paragraph","id":"44abc7cff577_7","name":"5789","type":"H3","href":null,"layout":null,"metadata":null,"text":"Brief Explanation of NMT and the Encoder Decoder Structure","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_8":{"__typename":"Paragraph","id":"44abc7cff577_8","name":"9011","type":"P","href":null,"layout":null,"metadata":null,"text":"The ultimate goal of any NMT model is to take a sentence in one language as input and return that sentence translated into a different language as output. 
The figure below is a naive representation of a translation algorithm (such as Google Translate) tasked with translating from English to Spanish.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*qWNEL7xMGraPLS6d6hCqyA.png":{"__typename":"ImageMetadata","id":"1*qWNEL7xMGraPLS6d6hCqyA.png","originalHeight":320,"originalWidth":1868,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_9":{"__typename":"Paragraph","id":"44abc7cff577_9","name":"b3f3","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*qWNEL7xMGraPLS6d6hCqyA.png"},"text":"Figure 1: Translation from English to Spanish of the English sentence “the cat likes to eat pizza”","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_10":{"__typename":"Paragraph","id":"44abc7cff577_10","name":"0533","type":"P","href":null,"layout":null,"metadata":null,"text":"Before diving into the Encoder Decoder structure that is oftentimes used as the algorithm in the above figure, we first must understand how we overcome a large hurdle in any machine translation task. Namely, we need a way to transform sentences into a data format that can be inputted into a machine learning model. In essence, we must somehow convert our textual data into a numeric form.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_11":{"__typename":"Paragraph","id":"44abc7cff577_11","name":"69f7","type":"P","href":null,"layout":null,"metadata":null,"text":"To do this in machine translation, each word is transformed into a One Hot Encoding vector which can then be inputted into the model. A One Hot Encoding vector is simply a vector with a 0 at every index except for a 1 at a single index corresponding to that particular word. 
In this way, each word has a distinct One Hot Encoding vector and thus we can represent every word in our dataset with a numerical representation.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_12":{"__typename":"Paragraph","id":"44abc7cff577_12","name":"8226","type":"P","href":null,"layout":null,"metadata":null,"text":"The first step towards creating these vectors is to assign an index to each unique word in the input language, and then repeat this process for the output language. In assigning a unique index to each unique word, we will be creating what is referred to as a Vocabulary for each language. Ideally, the Vocabulary for each language would simply contain every unique word in that language. However, given that any single language can have hundreds of thousands of words, the vocabulary is often trimmed to the N most common words in the dataset we are working with (where N is chosen arbitrarily, but often ranges from 1,000–100,000 depending on the dataset size).","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_13":{"__typename":"Paragraph","id":"44abc7cff577_13","name":"14aa","type":"P","href":null,"layout":null,"metadata":null,"text":"To understand how we can then use a Vocabulary to create One Hot Encoding vectors for every word in our dataset, consider a mini-Vocabulary containing just the words in Table 1 
below.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*_Pp0bAv3nZPYHbPFlvO7Hg.png":{"__typename":"ImageMetadata","id":"1*_Pp0bAv3nZPYHbPFlvO7Hg.png","originalHeight":356,"originalWidth":163,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_14":{"__typename":"Paragraph","id":"44abc7cff577_14","name":"e484","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*_Pp0bAv3nZPYHbPFlvO7Hg.png"},"text":"Table 1: Mini-vocabulary for the English language","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_15":{"__typename":"Paragraph","id":"44abc7cff577_15","name":"d89a","type":"P","href":null,"layout":null,"metadata":null,"text":"Given this table, we have assigned a unique index 0–12 to every word in our mini-Vocabulary. The \u003CSOS\u003E and \u003CEOS\u003E tokens in the table are added to every Vocabulary and stand for START OF SENTENCE and END OF SENTENCE respectively. They are used by the NMT model to help identify these crucial points in sentences.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_16":{"__typename":"Paragraph","id":"44abc7cff577_16","name":"fd00","type":"P","href":null,"layout":null,"metadata":null,"text":"Now, let’s say we want to convert the words in the sentence “the blue whale ate the red fish” to their one hot encoding vectors. 
Using Table 1, we would do this as shown in Figure 2 below.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*nxHrAM5dwoqqFFldP0Wv6w.png":{"__typename":"ImageMetadata","id":"1*nxHrAM5dwoqqFFldP0Wv6w.png","originalHeight":337,"originalWidth":753,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_17":{"__typename":"Paragraph","id":"44abc7cff577_17","name":"8506","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*nxHrAM5dwoqqFFldP0Wv6w.png"},"text":"Figure 2: One Hot Encoding vectors for the sentence “the blue whale ate the red fish”","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_18":{"__typename":"Paragraph","id":"44abc7cff577_18","name":"066b","type":"P","href":null,"layout":null,"metadata":null,"text":"As you can see above, each word becomes a vector of length 13 (which is the size of our vocabulary) and consists entirely of 0s except for a 1 at the index that was assigned to that word in Table 1.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_19":{"__typename":"Paragraph","id":"44abc7cff577_19","name":"7904","type":"P","href":null,"layout":null,"metadata":null,"text":"By creating a vocabulary for both the input and output languages, we can perform this technique on every sentence in each language to completely transform any corpus of translated sentences into a format suitable for the task of machine translation.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_20":{"__typename":"Paragraph","id":"44abc7cff577_20","name":"b472","type":"P","href":null,"layout":null,"metadata":null,"text":"Now, with an understanding of how we can represent textual data in a 
numeric way, let’s look at the magic behind this Encoder Decoder algorithm. At the most basic level, the Encoder portion of the model takes a sentence in the input language and creates a thought vector from this sentence. This thought vector stores the meaning of the sentence and is subsequently passed to a Decoder which outputs the translation of the sentence in the output language. This process is shown in the figure below.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"EM","start":256,"end":270,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":296,"end":310,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*KeD0mc9o9DQZ59-nO95sPw.png":{"__typename":"ImageMetadata","id":"1*KeD0mc9o9DQZ59-nO95sPw.png","originalHeight":244,"originalWidth":1036,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_21":{"__typename":"Paragraph","id":"44abc7cff577_21","name":"129a","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*KeD0mc9o9DQZ59-nO95sPw.png"},"text":"Figure 3: Encoder Decoder structure translating the English sentence “the cat likes to eat pizza” to the Spanish sentence “el gato le gusta comer pizza”","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_22":{"__typename":"Paragraph","id":"44abc7cff577_22","name":"cb9c","type":"P","href":null,"layout":null,"metadata":null,"text":"In the above architecture, the Encoder and the Decoder are both recurrent neural networks (RNN). In this particular tutorial, we will be using Long Short-Term Memory (LSTM) models, which are a type of RNN. However other RNN architectures, such as a GRU, are often used. 
At a basic level, RNNs are neural networks designed specifically to deal with temporal\u002Ftextual data. This article will give a high-level overview of how RNNs work in the context of NMT, however, I would strongly recommend looking further into these concepts if you are not already familiar with them. For a more thorough explanation of RNNs and LSTMs see here, and for a deeper article on LSTMs in the context of language translation, in particular, see here.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":625,"end":629,"href":"https:\u002F\u002Fcolah.github.io\u002Fposts\u002F2015-08-Understanding-LSTMs\u002F","anchorType":"LINK","userId":null,"linkMetadata":null},{"__typename":"Markup","type":"A","start":724,"end":728,"href":"http:\u002F\u002Fciteseerx.ist.psu.edu\u002Fviewdoc\u002Fdownload?doi=10.1.1.248.4448&rep=rep1&type=pdf","anchorType":"LINK","userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_23":{"__typename":"Paragraph","id":"44abc7cff577_23","name":"cdf9","type":"P","href":null,"layout":null,"metadata":null,"text":"In the case of the Encoder, each word in the input sentence is fed separately into the model in a number of consecutive time-steps. At each time-step, t, the model updates a hidden vector, h, using information from the word inputted to the model at that time-step. This hidden vector works to store information about the inputted sentence. In this way, since no words have yet been inputted to the Encoder at time-step t=0, the hidden state in the Encoder starts out as an empty vector at this time-step. 
We represent this hidden state with the blue box in Figure 4, where the subscript t=0 indicates the time-step and the superscript E corresponds to the fact that it’s a hidden state of the Encoder (rather than a D for the Decoder).","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"EM","start":151,"end":152,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":189,"end":190,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":191,"end":192,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":263,"end":265,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":419,"end":421,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":587,"end":588,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*GZCzxYAMHdiCLiTTHNBsSw.png":{"__typename":"ImageMetadata","id":"1*GZCzxYAMHdiCLiTTHNBsSw.png","originalHeight":108,"originalWidth":100,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_24":{"__typename":"Paragraph","id":"44abc7cff577_24","name":"52a2","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*GZCzxYAMHdiCLiTTHNBsSw.png"},"text":"Figure 4: Encoder hidden vector at t=0","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_25":{"__typename":"Paragraph","id":"44abc7cff577_25","name":"0773","type":"P","href":null,"layout":null,"metadata":null,"text":"At each time-step, this hidden vector takes in information from the inputted word at that time-step, while preserving the information it has already stored from previous time-steps. 
Thus, at the final time-step, the meaning of the whole input sentence is stored in the hidden vector. This hidden vector at the final time-step is the thought vector referred to above, which is then inputted into the Decoder. The process of encoding the English sentence “the cat likes to eat pizza” is represented in Figure 5.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"EM","start":333,"end":347,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*xd8j4KoKRSzRq0b1Vx0FAA.png":{"__typename":"ImageMetadata","id":"1*xd8j4KoKRSzRq0b1Vx0FAA.png","originalHeight":406,"originalWidth":1972,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_26":{"__typename":"Paragraph","id":"44abc7cff577_26","name":"d821","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*xd8j4KoKRSzRq0b1Vx0FAA.png"},"text":"Figure 5: Encoding of the sentence “the cat likes to eat pizza”","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_27":{"__typename":"Paragraph","id":"44abc7cff577_27","name":"b862","type":"P","href":null,"layout":null,"metadata":null,"text":"In the above figure, the blue arrows correspond to weight matrices, which we will work to enhance through training to achieve more accurate translations.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_28":{"__typename":"Paragraph","id":"44abc7cff577_28","name":"a1b2","type":"P","href":null,"layout":null,"metadata":null,"text":"Also, notice how the final hidden state of the Encoder becomes the thought vector and is relabeled with superscript D at t=0. This is because this final hidden vector of the Encoder becomes the initial hidden vector of the Decoder. 
In this way, we are passing the encoded meaning of the sentence to the Decoder to be translated to a sentence in the output language. However, unlike the Encoder, we need the Decoder to output a translated sentence of variable length. Thus, we are going to have our Decoder output a prediction word at each time-step until we have outputted a complete sentence.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"EM","start":121,"end":122,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_29":{"__typename":"Paragraph","id":"44abc7cff577_29","name":"d964","type":"P","href":null,"layout":null,"metadata":null,"text":"In order to start this translation, we are going to input a \u003CSOS\u003E tag as the input at the first time-step in the Decoder. Just as in the Encoder, the Decoder will use the \u003CSOS\u003E input at time-step t=1 to update its hidden state. However, rather than just proceeding to the next time-step, the Decoder will use an additional weight matrix to create a probability over all of the words in the output vocabulary. In this way, the word with the highest probability in the output vocabulary will become the first word in the predicted output sentence. This first step of the Decoder, translating from “the cat likes to eat pizza” to “el gato le gusta comer pizza” is shown in Figure 6. 
For the sake of simplicity, the output vocabulary is restricted to the words in the output sentence (but in practice would consist of the thousands of words in the entire output vocabulary).","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"EM","start":196,"end":197,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*0kJbrSpwyzneRE1hHUHB6g.png":{"__typename":"ImageMetadata","id":"1*0kJbrSpwyzneRE1hHUHB6g.png","originalHeight":484,"originalWidth":312,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_30":{"__typename":"Paragraph","id":"44abc7cff577_30","name":"8e82","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*0kJbrSpwyzneRE1hHUHB6g.png"},"text":"Figure 6: First step of the Decoder","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_31":{"__typename":"Paragraph","id":"44abc7cff577_31","name":"b8f8","type":"P","href":null,"layout":null,"metadata":null,"text":"Now, given that the word “el” was given the highest probability, this word becomes the first word in our outputted prediction sentence. 
And we proceed by using “el” as the input in the next time-step as in Figure 7 below.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*fIOdNSYBADB4452cRXWcsA.png":{"__typename":"ImageMetadata","id":"1*fIOdNSYBADB4452cRXWcsA.png","originalHeight":510,"originalWidth":441,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_32":{"__typename":"Paragraph","id":"44abc7cff577_32","name":"e4fd","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*fIOdNSYBADB4452cRXWcsA.png"},"text":"Figure 7: Second step of the Decoder","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_33":{"__typename":"Paragraph","id":"44abc7cff577_33","name":"f8f4","type":"P","href":null,"layout":null,"metadata":null,"text":"We proceed in this way through the duration of the sentence — that is until we run into an error such as that depicted below in Figure 8.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*cMo4JhbtDkm1Wy1EnqhEkg.png":{"__typename":"ImageMetadata","id":"1*cMo4JhbtDkm1Wy1EnqhEkg.png","originalHeight":511,"originalWidth":752,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_34":{"__typename":"Paragraph","id":"44abc7cff577_34","name":"ab9e","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*cMo4JhbtDkm1Wy1EnqhEkg.png"},"text":"Figure 8: Translation error in Decoder","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_35":{"__typename":"Paragraph","id":"44abc7cff577_35","name":"58e9","type":"P","href":null,"layout":null,"metadata":null,"text":"As you can see, the Decoder has predicted “pizza” to be the next word in the translated 
sentence, when it should actually be “comer”. When testing the model on the test set, we would do nothing to correct this error and would allow the Decoder to use this improper prediction as the input at the next time-step. However, during the training process, we are going to keep “pizza” as the predicted word at that point in the sentence, but force our Decoder to input the correct word “comer” as the input for the next time-step. This is a strategy referred to as teacher-forcing and helps speed up the training process. It is shown in the below figure.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":559,"end":574,"href":"https:\u002F\u002Fmachinelearningmastery.com\u002Fteacher-forcing-for-recurrent-neural-networks\u002F","anchorType":"LINK","userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*ignkCc7wFznUGDBKN-3ylg.png":{"__typename":"ImageMetadata","id":"1*ignkCc7wFznUGDBKN-3ylg.png","originalHeight":497,"originalWidth":963,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_36":{"__typename":"Paragraph","id":"44abc7cff577_36","name":"23fb","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*ignkCc7wFznUGDBKN-3ylg.png"},"text":"Figure 9: Teacher-forcing","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_37":{"__typename":"Paragraph","id":"44abc7cff577_37","name":"6901","type":"P","href":null,"layout":null,"metadata":null,"text":"Now, since the Decoder has to output prediction sentences of variable lengths, the Decoder will continue predicting words in this fashion until it predicts the next word in the sentence to be a \u003CEOS\u003E tag. Once this tag has been predicted, the decoding process is complete and we are left with a complete predicted translation of the input sentence. 
The entire process of decoding the thought vector for the input sentence “the cat likes to eat pizza” is shown in Figure 10.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*GwKpF9yMipPWuruXoTWKPQ.png":{"__typename":"ImageMetadata","id":"1*GwKpF9yMipPWuruXoTWKPQ.png","originalHeight":347,"originalWidth":970,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_38":{"__typename":"Paragraph","id":"44abc7cff577_38","name":"8b35","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*GwKpF9yMipPWuruXoTWKPQ.png"},"text":"Figure 10: Decoding of the sentence “the cat likes to eat pizza”","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_39":{"__typename":"Paragraph","id":"44abc7cff577_39","name":"fea2","type":"P","href":null,"layout":null,"metadata":null,"text":"We can then compare the accuracy of this predicted translation to the actual translation of the input sentence to compute a loss. While there are several varieties of loss functions, a very common one to utilize is the Cross-Entropy Loss. 
The equation of this loss function is detailed in Figure 11.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*JZ-qea3BYaGOT4Vdhds9mQ.png":{"__typename":"ImageMetadata","id":"1*JZ-qea3BYaGOT4Vdhds9mQ.png","originalHeight":394,"originalWidth":498,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_40":{"__typename":"Paragraph","id":"44abc7cff577_40","name":"ef8d","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*JZ-qea3BYaGOT4Vdhds9mQ.png"},"text":"Figure 11: Cross-Entropy Loss function","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_41":{"__typename":"Paragraph","id":"44abc7cff577_41","name":"f487","type":"P","href":null,"layout":null,"metadata":null,"text":"In essence, what this loss function does is sum over the negative log likelihoods that the model gives to the correct word at each position in the output sentence. 
Given that the negative log function has a value of 0 when the input is 1 and increases exponentially as the input approaches 0 (as shown in Figure 12), the closer the probability that the model gives to the correct word at each point in the sentence is to 100%, the lower the loss.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*GLLqWJkqlF4RcbuftbIIDA.png":{"__typename":"ImageMetadata","id":"1*GLLqWJkqlF4RcbuftbIIDA.png","originalHeight":528,"originalWidth":419,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_42":{"__typename":"Paragraph","id":"44abc7cff577_42","name":"6cd3","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*GLLqWJkqlF4RcbuftbIIDA.png"},"text":"Figure 12: Graph of the function y = -log(x)","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_43":{"__typename":"Paragraph","id":"44abc7cff577_43","name":"5c8f","type":"P","href":null,"layout":null,"metadata":null,"text":"For example, given that the correct first word in the output sentence above is “el”, and our model gave a fairly high probability to the word “el” at that position, the loss for this position would be fairly low. 
Conversely, since the correct word at time-step t=5 is “comer”, but our model gave a rather low probability to the word “comer”, the loss at that step would be relatively high.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"EM","start":261,"end":262,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_44":{"__typename":"Paragraph","id":"44abc7cff577_44","name":"2bec","type":"P","href":null,"layout":null,"metadata":null,"text":"By summing over the loss for each word in the output sentence a total loss for the sentence is obtained. This loss corresponds to the accuracy of the translation, with lower loss values corresponding to better translations. When training, the loss values of several sentences in a batch would be summed together, resulting in a total batch loss. This batch loss would then be used to perform mini-batch gradient descent to update all of the weight matrices in both the Decoder and the Encoder. These updates modify the weight matrices to slightly enhance the accuracy of the model’s translations. 
Thus, by performing this process iteratively, we eventually construct weight matrices that are capable of creating quality translations.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_45":{"__typename":"Paragraph","id":"44abc7cff577_45","name":"a897","type":"P","href":null,"layout":null,"metadata":null,"text":"If you are unfamiliar with the concept of batches and\u002For mini-batch gradient descent you can find a short explanation of these concepts here.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":136,"end":140,"href":"https:\u002F\u002Fmachinelearningmastery.com\u002Fgentle-introduction-mini-batch-gradient-descent-configure-batch-size\u002F","anchorType":"LINK","userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":0,"end":141,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_46":{"__typename":"Paragraph","id":"44abc7cff577_46","name":"452a","type":"P","href":null,"layout":null,"metadata":null,"text":"As mentioned in the introduction, an attention mechanism is an incredible tool that greatly enhances an NMT model’s ability to create accurate translations. While there are a number of different types of attention mechanisms, some of which you can read about here, the model built in this tutorial uses a rather simple implementation of global attention. In this method of attention, at each time-step, the Decoder “looks back” at all of the hidden vectors of the Encoder to create a memory vector. It then uses this memory vector, along with the hidden vector in the Decoder at that time-step, to predict the next word in the translated sentence. In doing this, the Decoder utilizes valuable information from the Encoder that would otherwise go to waste. 
A visual representation of this process is shown in Figure 13. I’d recommend reading the linked article in this paragraph to learn more about the various ways this memory vector can be calculated to gain a better understanding of this important concept.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":259,"end":263,"href":"https:\u002F\u002Farxiv.org\u002Fpdf\u002F1508.04025.pdf","anchorType":"LINK","userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*ff0Uh3mefwAMH7Z0gNrqTQ.png":{"__typename":"ImageMetadata","id":"1*ff0Uh3mefwAMH7Z0gNrqTQ.png","originalHeight":513,"originalWidth":788,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_47":{"__typename":"Paragraph","id":"44abc7cff577_47","name":"1dc2","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*ff0Uh3mefwAMH7Z0gNrqTQ.png"},"text":"Figure 13: Attention mechanism for time-step t=1 in Decoder","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_48":{"__typename":"Paragraph","id":"44abc7cff577_48","name":"34c6","type":"P","href":null,"layout":null,"metadata":null,"text":"Note: Attention mechanisms are incredibly powerful and have recently been proposed (and shown) to be more effective when used on their own (i.e. without any RNN architecture). 
If you’re interested in NMT I’d recommend you look into transformers and particularly read the article “Attention Is All You Need”.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":280,"end":305,"href":"https:\u002F\u002Farxiv.org\u002Fpdf\u002F1706.03762.pdf","anchorType":"LINK","userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":0,"end":307,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_49":{"__typename":"Paragraph","id":"44abc7cff577_49","name":"74d8","type":"H3","href":null,"layout":null,"metadata":null,"text":"Coding Tutorial (Python)","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"STRONG","start":0,"end":24,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_50":{"__typename":"Paragraph","id":"44abc7cff577_50","name":"e2b5","type":"P","href":null,"layout":null,"metadata":null,"text":"Before beginning the tutorial I would like to reiterate that this tutorial is derived largely from the PyTorch tutorial “Translation with a Sequence to Sequence Network and Attention”. However, this tutorial is optimized in a number of ways. 
Most notably, this code allows for the data to be separated into batches (thus allowing us to utilize the enhanced parallel computing power of a GPU), can split datasets into a train and a test set, and also has added functionality to run on datasets of various formats.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":121,"end":182,"href":"https:\u002F\u002Fpytorch.org\u002Ftutorials\u002Fintermediate\u002Fseq2seq_translation_tutorial.html","anchorType":"LINK","userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_51":{"__typename":"Paragraph","id":"44abc7cff577_51","name":"3b5b","type":"P","href":null,"layout":null,"metadata":null,"text":"Before we dive into the code tutorial, a little setup is in store. If you’d like to run the model on a GPU (highly recommended), this tutorial is going to be using Google Colab, which offers free access to Jupyter notebooks with GPU capability. If you have other access to a GPU then feel free to use that as well. Otherwise, you can look into a variety of other free online GPU options. The code can be run on a CPU, but the capability of any model will be constrained by computational power (and make sure to change the batch-size to 1 if you choose to do so).","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":363,"end":386,"href":"https:\u002F\u002Ftowardsdatascience.com\u002Ftraining-machine-learning-models-online-for-free-gpu-tpu-enabled-5def6a5c1ce3","anchorType":"LINK","userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_52":{"__typename":"Paragraph","id":"44abc7cff577_52","name":"9bf0","type":"P","href":null,"layout":null,"metadata":null,"text":"To get started, navigate to Google Colaboratory and log into a Google account to get started. 
From here, navigate to File \u003E New Python 3 Notebook to launch a Jupyter notebook. Once you’ve opened up a new notebook, we first need to enable GPU capabilities. To do so, navigate to the top left of the page and select Edit \u003E Notebook Settings. From here select GPU in the dropdown menu under “Hardware accelerator.”","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":28,"end":47,"href":"https:\u002F\u002Fcolab.research.google.com","anchorType":"LINK","userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*xPZ_B0b44kdYuRp9jhQI1A.png":{"__typename":"ImageMetadata","id":"1*xPZ_B0b44kdYuRp9jhQI1A.png","originalHeight":304,"originalWidth":422,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_53":{"__typename":"Paragraph","id":"44abc7cff577_53","name":"7f6c","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*xPZ_B0b44kdYuRp9jhQI1A.png"},"text":"Figure 14: Enabling GPU capabilities on Google Colab","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_54":{"__typename":"Paragraph","id":"44abc7cff577_54","name":"b817","type":"P","href":null,"layout":null,"metadata":null,"text":"We now have a Jupyter notebook with GPU capabilities and can start working towards creating an NMT model! 
First, we will import all of the necessary packages.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"MediaResource:bd69bfd55f042d4c045055b4561fc2e9":{"__typename":"MediaResource","id":"bd69bfd55f042d4c045055b4561fc2e9","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"nmt"},"Paragraph:44abc7cff577_55":{"__typename":"Paragraph","id":"44abc7cff577_55","name":"a871","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:bd69bfd55f042d4c045055b4561fc2e9"}},"mixtapeMetadata":null},"Paragraph:44abc7cff577_56":{"__typename":"Paragraph","id":"44abc7cff577_56","name":"ff32","type":"P","href":null,"layout":null,"metadata":null,"text":"Now, run the following code to check if GPU capabilities are enabled.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"MediaResource:118f6197a18173b97b6f93b4f69a0a29":{"__typename":"MediaResource","id":"118f6197a18173b97b6f93b4f69a0a29","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"nmt"},"Paragraph:44abc7cff577_57":{"__typename":"Paragraph","id":"44abc7cff577_57","name":"f8c4","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:118f6197a18173b97b6f93b4f69a0a29"}},"mixtapeMetadata":null},"Paragraph:44abc7cff577_58":{"__typename":"Paragraph","id":"44abc7cff577_58","name":"cc28","type":"P","href":null,"layout":null,"metadata":null,"text":"If TRUE is returned, GPU is available. Now, before we begin doing any translation, we first need to create a number of functions which will prepare the data. 
The following functions serve to clean the data and allow functionality for us to remove sentences that are too long or whose input sentences don’t start with certain words.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"MediaResource:c118cd5ee74be2e066c9d99dd294f881":{"__typename":"MediaResource","id":"c118cd5ee74be2e066c9d99dd294f881","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"nmt"},"Paragraph:44abc7cff577_59":{"__typename":"Paragraph","id":"44abc7cff577_59","name":"8ec1","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:c118cd5ee74be2e066c9d99dd294f881"}},"mixtapeMetadata":null},"Paragraph:44abc7cff577_60":{"__typename":"Paragraph","id":"44abc7cff577_60","name":"6125","type":"P","href":null,"layout":null,"metadata":null,"text":"Now, with functions that will clean the data, we need a way to transform this cleaned textual data into One Hot Encoding vectors. 
First, we create a Lang class which will essentially allow us to construct a vocabulary for both the input and output languages.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"EM","start":149,"end":154,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"MediaResource:61b6b7c16f612f6761d67a62eafdc931":{"__typename":"MediaResource","id":"61b6b7c16f612f6761d67a62eafdc931","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"nmt"},"Paragraph:44abc7cff577_61":{"__typename":"Paragraph","id":"44abc7cff577_61","name":"4fad","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:61b6b7c16f612f6761d67a62eafdc931"}},"mixtapeMetadata":null},"Paragraph:44abc7cff577_62":{"__typename":"Paragraph","id":"44abc7cff577_62","name":"a2d7","type":"P","href":null,"layout":null,"metadata":null,"text":"Next, we create a prepareLangs function which will take a dataset of translated sentences and create Lang classes for the input and the output languages of a 
dataset.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"EM","start":18,"end":30,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":101,"end":105,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"MediaResource:5a4b9e8ea8632e5c6844b1bcc67ca6da":{"__typename":"MediaResource","id":"5a4b9e8ea8632e5c6844b1bcc67ca6da","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"nmt"},"Paragraph:44abc7cff577_63":{"__typename":"Paragraph","id":"44abc7cff577_63","name":"d3fa","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:5a4b9e8ea8632e5c6844b1bcc67ca6da"}},"mixtapeMetadata":null},"Paragraph:44abc7cff577_64":{"__typename":"Paragraph","id":"44abc7cff577_64","name":"efc7","type":"P","href":null,"layout":null,"metadata":null,"text":"This function has the ability to work with input and output sentences that are contained in two separate files or in a single file. If the sentences are in two separate files, each sentence must be separated by a newline and each line in the files must correspond to each other (i.e. make a sentence pair). 
For example, if your input file is english.txt and output file is espanol.txt the files should be formatted as in Figure 15.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*xCEe0bM_9WB00iZEYWj90Q.png":{"__typename":"ImageMetadata","id":"1*xCEe0bM_9WB00iZEYWj90Q.png","originalHeight":316,"originalWidth":1116,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_65":{"__typename":"Paragraph","id":"44abc7cff577_65","name":"02cb","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*xCEe0bM_9WB00iZEYWj90Q.png"},"text":"Figure 15: Format for dataset stored in two separate files.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_66":{"__typename":"Paragraph","id":"44abc7cff577_66","name":"2202","type":"P","href":null,"layout":null,"metadata":null,"text":"On the other hand, if the input and output sentences are stored in a single file, each sentence in the pair must be separated by a tab and each sentence pair must be separated by a newline. 
For example, if your single file name is data.txt, the file should be formatted as in Figure 16.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*hK9a8XjUdkyv-_nHd80KYw.png":{"__typename":"ImageMetadata","id":"1*hK9a8XjUdkyv-_nHd80KYw.png","originalHeight":313,"originalWidth":688,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_67":{"__typename":"Paragraph","id":"44abc7cff577_67","name":"2791","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*hK9a8XjUdkyv-_nHd80KYw.png"},"text":"Figure 16: Format for dataset stored in one single file.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_68":{"__typename":"Paragraph","id":"44abc7cff577_68","name":"d89c","type":"P","href":null,"layout":null,"metadata":null,"text":"Note: In order for this function to work with both one and two files, the file_path argument must be in the tuple format with two elements in the tuple if the data is stored in two files, and one element in the tuple if the data is stored in a single file.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"EM","start":0,"end":256,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_69":{"__typename":"Paragraph","id":"44abc7cff577_69","name":"3c5b","type":"P","href":null,"layout":null,"metadata":null,"text":"With a function that works to prepare the language vocabularies for both the input and output languages, we can use all of the above functions to create a single function that will take a dataset of both input and target sentences and complete all of the preprocessing steps. 
Thus, the prepareData function will create  Lang classes for each language and fully clean and trim the data according to the specified passed arguments. In the end, this function will return both language classes along with a set of training pairs and a set of test pairs.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"EM","start":286,"end":298,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":320,"end":324,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"MediaResource:a8af8b00bfe7b13a1bc44f37602c06e9":{"__typename":"MediaResource","id":"a8af8b00bfe7b13a1bc44f37602c06e9","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"nmt"},"Paragraph:44abc7cff577_70":{"__typename":"Paragraph","id":"44abc7cff577_70","name":"0e36","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:a8af8b00bfe7b13a1bc44f37602c06e9"}},"mixtapeMetadata":null},"Paragraph:44abc7cff577_71":{"__typename":"Paragraph","id":"44abc7cff577_71","name":"a4d0","type":"P","href":null,"layout":null,"metadata":null,"text":"While we have created a vocabulary for each language, we still need to create functions which use these vocabularies to transform sentence pairs both to and from their One Hot Encoding vector 
representations.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"MediaResource:6c08bbc45d47676da696d5b78e0fee0b":{"__typename":"MediaResource","id":"6c08bbc45d47676da696d5b78e0fee0b","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":""},"Paragraph:44abc7cff577_72":{"__typename":"Paragraph","id":"44abc7cff577_72","name":"c03c","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:6c08bbc45d47676da696d5b78e0fee0b"}},"mixtapeMetadata":null},"Paragraph:44abc7cff577_73":{"__typename":"Paragraph","id":"44abc7cff577_73","name":"53f8","type":"P","href":null,"layout":null,"metadata":null,"text":"NMT is no different than normal machine learning in that minibatch gradient descent is the most effective way to train a model. Thus, before we begin building our model, we want to create a function to batchify our sentence pairs so that we can perform gradient descent on mini-batches. We also create the function pad_batch to handle the issue of variable length sentences in a batch. 
This function essentially appends \u003CEOS\u003E tags to the end of each of the shorter sentences until every sentence in the batch is the same length.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":57,"end":126,"href":"https:\u002F\u002Fmachinelearningmastery.com\u002Fgentle-introduction-mini-batch-gradient-descent-configure-batch-size\u002F","anchorType":"LINK","userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":202,"end":210,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":315,"end":324,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"MediaResource:9dabbf9329432bd02019676b4a428ea4":{"__typename":"MediaResource","id":"9dabbf9329432bd02019676b4a428ea4","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":""},"Paragraph:44abc7cff577_74":{"__typename":"Paragraph","id":"44abc7cff577_74","name":"24fd","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:9dabbf9329432bd02019676b4a428ea4"}},"mixtapeMetadata":null},"Paragraph:44abc7cff577_75":{"__typename":"Paragraph","id":"44abc7cff577_75","name":"9e98","type":"P","href":null,"layout":null,"metadata":null,"text":"And with that, we have created all of the necessary functions to preprocess the data and are finally ready to build our Encoder Decoder model! With a general understanding of the Encoder Decoder architecture and attention mechanisms, let’s dive into the Python code that creates these frameworks. 
Rather than explain each aspect of the Encoder and the Decoder, I will simply provide the code and refer you to the PyTorch documentation for any questions you may have on various aspects of the code.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":413,"end":434,"href":"https:\u002F\u002Fpytorch.org\u002Fdocs\u002Fstable\u002Fnn.html","anchorType":"LINK","userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"MediaResource:ab5858243109b65ba4955d1a9388ca54":{"__typename":"MediaResource","id":"ab5858243109b65ba4955d1a9388ca54","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"encoder"},"Paragraph:44abc7cff577_76":{"__typename":"Paragraph","id":"44abc7cff577_76","name":"de08","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:ab5858243109b65ba4955d1a9388ca54"}},"mixtapeMetadata":null},"MediaResource:fa0825d74b85e610860945aef1ae82b5":{"__typename":"MediaResource","id":"fa0825d74b85e610860945aef1ae82b5","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"attn_decoder"},"Paragraph:44abc7cff577_77":{"__typename":"Paragraph","id":"44abc7cff577_77","name":"e4eb","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:fa0825d74b85e610860945aef1ae82b5"}},"mixtapeMetadata":null},"Paragraph:44abc7cff577_78":{"__typename":"Paragraph","id":"44abc7cff577_78","name":"3904","type":"P","href":null,"layout":null,"metadata":null,"text":"Now, in order to train and test the model, we will use the following functions. The train_batch function below performs a training loop on a single training batch. 
This includes completing a forward pass through the model to create a predicted translation for each sentence in the batch, computing the total loss for the batch, and then back-propagating on the loss to update all of the weight matrices in both the Encoder and the Decoder.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"EM","start":84,"end":95,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"MediaResource:4017f05c5ffdbd73329597ff5ac0a367":{"__typename":"MediaResource","id":"4017f05c5ffdbd73329597ff5ac0a367","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"train_batch"},"Paragraph:44abc7cff577_79":{"__typename":"Paragraph","id":"44abc7cff577_79","name":"affa","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:4017f05c5ffdbd73329597ff5ac0a367"}},"mixtapeMetadata":null},"Paragraph:44abc7cff577_80":{"__typename":"Paragraph","id":"44abc7cff577_80","name":"8aad","type":"P","href":null,"layout":null,"metadata":null,"text":"The train function simply performs the train_batch function iteratively for each batch in a list of batches. 
In this way, we can pass a list of all of the training batches to complete a full epoch through the training data.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"EM","start":4,"end":9,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":39,"end":50,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"MediaResource:565cb44177dbda1652647dd73a426e43":{"__typename":"MediaResource","id":"565cb44177dbda1652647dd73a426e43","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"train"},"Paragraph:44abc7cff577_81":{"__typename":"Paragraph","id":"44abc7cff577_81","name":"4e71","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:565cb44177dbda1652647dd73a426e43"}},"mixtapeMetadata":null},"Paragraph:44abc7cff577_82":{"__typename":"Paragraph","id":"44abc7cff577_82","name":"6b6a","type":"P","href":null,"layout":null,"metadata":null,"text":"The following test_batch and test functions are essentially the same as the train_batch and train functions, with the exception that these test functions are to be performed on the test data and do not include a back-propagation step. Thus, these functions do not update the weight matrices in the model and are solely used to evaluate the loss (i.e. the accuracy) on test data. 
In turn, this will help us track how the model performs on data outside of the training set.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"EM","start":14,"end":24,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":29,"end":33,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":76,"end":87,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":92,"end":97,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"MediaResource:d0fb61030e7bce77474bbaa6d916ae5f":{"__typename":"MediaResource","id":"d0fb61030e7bce77474bbaa6d916ae5f","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"test_batch"},"Paragraph:44abc7cff577_83":{"__typename":"Paragraph","id":"44abc7cff577_83","name":"dba5","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:d0fb61030e7bce77474bbaa6d916ae5f"}},"mixtapeMetadata":null},"MediaResource:fda944e8f24fc716014caa77b5789b66":{"__typename":"MediaResource","id":"fda944e8f24fc716014caa77b5789b66","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"test"},"Paragraph:44abc7cff577_84":{"__typename":"Paragraph","id":"44abc7cff577_84","name":"7113","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:fda944e8f24fc716014caa77b5789b66"}},"mixtapeMetadata":null},"Paragraph:44abc7cff577_85":{"__typename":"Paragraph","id":"44abc7cff577_85","name":"a2d9","type":"P","href":null,"layout":null,"metadata":null,"text":"During training, it will also be 
nice to be able to track our progress in a more qualitative sense. The evaluate function will allow us to do so by returning the predicted translation that our model makes for a given input sentence. And the evaluate_randomly function will simply predict translation for a specified number of sentences chosen randomly from the test set (if we have one) or the train set.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"EM","start":104,"end":113,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":241,"end":258,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"MediaResource:5a760c84a0855b1fcf81a252e3547dd2":{"__typename":"MediaResource","id":"5a760c84a0855b1fcf81a252e3547dd2","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"evaluate"},"Paragraph:44abc7cff577_86":{"__typename":"Paragraph","id":"44abc7cff577_86","name":"8150","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:5a760c84a0855b1fcf81a252e3547dd2"}},"mixtapeMetadata":null},"MediaResource:9581f11b954cb58f0291bbc903684493":{"__typename":"MediaResource","id":"9581f11b954cb58f0291bbc903684493","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"evaluate_randomly"},"Paragraph:44abc7cff577_87":{"__typename":"Paragraph","id":"44abc7cff577_87","name":"c082","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:9581f11b954cb58f0291bbc903684493"}},"mixtapeMetadata":null},"Paragraph:44abc7cff577_88":{"__typename":"Paragraph","id":"44abc7cff577_88","name":"2ef6","type":"P","href":null,"layout":null,"meta
data":null,"text":"A few helper functions below will work to plot our training progress, print memory consumption, and reformat time measurements.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"MediaResource:f80e72a3719f6df455b816c8b9fdcdca":{"__typename":"MediaResource","id":"f80e72a3719f6df455b816c8b9fdcdca","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"helper_functions"},"Paragraph:44abc7cff577_89":{"__typename":"Paragraph","id":"44abc7cff577_89","name":"1fe5","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:f80e72a3719f6df455b816c8b9fdcdca"}},"mixtapeMetadata":null},"Paragraph:44abc7cff577_90":{"__typename":"Paragraph","id":"44abc7cff577_90","name":"472d","type":"P","href":null,"layout":null,"metadata":null,"text":"And finally, we can put all of these functions into a master function which we will call train_and_test. This function will take quite a few arguments, but will completely train our model while evaluating our progress on the train set (and test set if present) at specified intervals. Also, some arguments will specify whether we want to save the output in a separate .txt file, create a graph of the loss values over time, and also allow us to save the weights of both the Encoder and the Decoder for future use. 
The next few cells after this function will outline how you can modify each argument, but just know that this function will essentially be all we need to run in order to train the model.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":502,"end":512,"href":"https:\u002F\u002Fpytorch.org\u002Ftutorials\u002Fbeginner\u002Fsaving_loading_models.html","anchorType":"LINK","userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":89,"end":103,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"MediaResource:ba5254761ac61bc2d648748a8df608fd":{"__typename":"MediaResource","id":"ba5254761ac61bc2d648748a8df608fd","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"train_and_test"},"Paragraph:44abc7cff577_91":{"__typename":"Paragraph","id":"44abc7cff577_91","name":"519d","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:ba5254761ac61bc2d648748a8df608fd"}},"mixtapeMetadata":null},"Paragraph:44abc7cff577_92":{"__typename":"Paragraph","id":"44abc7cff577_92","name":"8663","type":"P","href":null,"layout":null,"metadata":null,"text":"Now that we have everything in place we are ready to import our dataset, initialize all of the hyperparameters, and start training!","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_93":{"__typename":"Paragraph","id":"44abc7cff577_93","name":"20ba","type":"P","href":null,"layout":null,"metadata":null,"text":"First, in order to upload a dataset, run the following 
cell:","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"MediaResource:29e76b165da70d8d808321308a08e9b1":{"__typename":"MediaResource","id":"29e76b165da70d8d808321308a08e9b1","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"upload_data"},"Paragraph:44abc7cff577_94":{"__typename":"Paragraph","id":"44abc7cff577_94","name":"6560","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:29e76b165da70d8d808321308a08e9b1"}},"mixtapeMetadata":null},"Paragraph:44abc7cff577_95":{"__typename":"Paragraph","id":"44abc7cff577_95","name":"efc2","type":"P","href":null,"layout":null,"metadata":null,"text":"And you will see the following:","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*pHnANxJlp7IdPElhnvJWLw.png":{"__typename":"ImageMetadata","id":"1*pHnANxJlp7IdPElhnvJWLw.png","originalHeight":87,"originalWidth":383,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_96":{"__typename":"Paragraph","id":"44abc7cff577_96","name":"c829","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*pHnANxJlp7IdPElhnvJWLw.png"},"text":"Figure 17: Upload data to Google Colab","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_97":{"__typename":"Paragraph","id":"44abc7cff577_97","name":"e1e8","type":"P","href":null,"layout":null,"metadata":null,"text":"Simply click on the “Choose Files” button and navigate to the dataset you wish to upload. In this tutorial, we are using the same dataset that was used in the original PyTorch tutorial. You can download that dataset of English to French translations here. 
You can also experiment with a number of other datasets of various languages here.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":250,"end":254,"href":"https:\u002F\u002Fgithub.com\u002Fqlanners\u002Fnmt_tutorial\u002Fblob\u002Fmaster\u002Feng-fra.txt","anchorType":"LINK","userId":null,"linkMetadata":null},{"__typename":"Markup","type":"A","start":333,"end":337,"href":"https:\u002F\u002Fwww.manythings.org\u002Fanki\u002F","anchorType":"LINK","userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_98":{"__typename":"Paragraph","id":"44abc7cff577_98","name":"0400","type":"P","href":null,"layout":null,"metadata":null,"text":"If you are looking to get more state-of-the-art results I’d recommend trying to train on a larger dataset. You can find some larger datasets here, but also feel free to use any corpus of translated excerpts as long as they are formatted like in Figure 15 or Figure 16 above.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":141,"end":145,"href":"http:\u002F\u002Fwww.statmt.org\u002Fwmt14\u002Ftranslation-task.html","anchorType":"LINK","userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_99":{"__typename":"Paragraph","id":"44abc7cff577_99","name":"aeb8","type":"P","href":null,"layout":null,"metadata":null,"text":"Note: You may have issues uploading larger datasets to Google Colab using the upload method presented in this tutorial. 
If you run into such issues, read this article to learn how to upload large files.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":154,"end":166,"href":"https:\u002F\u002Fwww.freecodecamp.org\u002Fnews\u002Fhow-to-transfer-large-files-to-google-colab-and-remote-jupyter-notebooks-26ca252892fa\u002F","anchorType":"LINK","userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":0,"end":202,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_100":{"__typename":"Paragraph","id":"44abc7cff577_100","name":"5a51","type":"P","href":null,"layout":null,"metadata":null,"text":"Now, run the following cell to ensure that your dataset has been successfully uploaded.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*TNecR90YCwzPKhiOZCNytg.png":{"__typename":"ImageMetadata","id":"1*TNecR90YCwzPKhiOZCNytg.png","originalHeight":67,"originalWidth":230,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_101":{"__typename":"Paragraph","id":"44abc7cff577_101","name":"297b","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*TNecR90YCwzPKhiOZCNytg.png"},"text":"Figure 18: Run ls to ensure dataset has been uploaded","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_102":{"__typename":"Paragraph","id":"44abc7cff577_102","name":"8af5","type":"P","href":null,"layout":null,"metadata":null,"text":"From here, edit the following cells to apply to your dataset and 
desires.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"MediaResource:1d4c039e1ad50572e039fab8c5069b60":{"__typename":"MediaResource","id":"1d4c039e1ad50572e039fab8c5069b60","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"data_info"},"Paragraph:44abc7cff577_103":{"__typename":"Paragraph","id":"44abc7cff577_103","name":"8636","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:1d4c039e1ad50572e039fab8c5069b60"}},"mixtapeMetadata":null},"MediaResource:7acaea7efff11aca26a6229229e7d266":{"__typename":"MediaResource","id":"7acaea7efff11aca26a6229229e7d266","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"output_options"},"Paragraph:44abc7cff577_104":{"__typename":"Paragraph","id":"44abc7cff577_104","name":"2513","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:7acaea7efff11aca26a6229229e7d266"}},"mixtapeMetadata":null},"Paragraph:44abc7cff577_105":{"__typename":"Paragraph","id":"44abc7cff577_105","name":"46ba","type":"P","href":null,"layout":null,"metadata":null,"text":"The following cell consists of the variety of hyperparameters that you are going to need to play with towards finding an effective NMT model. 
So have fun experimenting with these.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"MediaResource:153749da86a552396f6f5b337c8a7f1d":{"__typename":"MediaResource","id":"153749da86a552396f6f5b337c8a7f1d","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"hyperparameters"},"Paragraph:44abc7cff577_106":{"__typename":"Paragraph","id":"44abc7cff577_106","name":"e895","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:153749da86a552396f6f5b337c8a7f1d"}},"mixtapeMetadata":null},"Paragraph:44abc7cff577_107":{"__typename":"Paragraph","id":"44abc7cff577_107","name":"5ded","type":"P","href":null,"layout":null,"metadata":null,"text":"And finally, you just need to run the following cell to train your model according to all of the hyperparameters you set above.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"MediaResource:3db3f786423bee16cbf00e56a3b20a4f":{"__typename":"MediaResource","id":"3db3f786423bee16cbf00e56a3b20a4f","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"main"},"Paragraph:44abc7cff577_108":{"__typename":"Paragraph","id":"44abc7cff577_108","name":"4c23","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:3db3f786423bee16cbf00e56a3b20a4f"}},"mixtapeMetadata":null},"Paragraph:44abc7cff577_109":{"__typename":"Paragraph","id":"44abc7cff577_109","name":"0f2b","type":"P","href":null,"layout":null,"metadata":null,"text":"And voilà! You have just trained an NMT model! Congrats! 
If you saved any graphs, output files, or output weights, you can view all of the saved files by running ls again. And to download any of these files simply run the code below.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"MediaResource:ddfb20e15e57ca77421e58d1878758fc":{"__typename":"MediaResource","id":"ddfb20e15e57ca77421e58d1878758fc","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"download_file"},"Paragraph:44abc7cff577_110":{"__typename":"Paragraph","id":"44abc7cff577_110","name":"1074","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:ddfb20e15e57ca77421e58d1878758fc"}},"mixtapeMetadata":null},"Paragraph:44abc7cff577_111":{"__typename":"Paragraph","id":"44abc7cff577_111","name":"13b6","type":"P","href":null,"layout":null,"metadata":null,"text":"Now, if you’d like to test the model on sentences outside both the train and the test set you can do that as well. 
Just make sure the sentence you are trying to translate is in the same language as the input language of your model.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"MediaResource:f7be36498a19c0a0b265758994f171a6":{"__typename":"MediaResource","id":"f7be36498a19c0a0b265758994f171a6","iframeSrc":"","iframeHeight":0,"iframeWidth":0,"title":"outside_sentence_test"},"Paragraph:44abc7cff577_112":{"__typename":"Paragraph","id":"44abc7cff577_112","name":"1981","type":"IFRAME","href":null,"layout":"INSET_CENTER","metadata":null,"text":"","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":{"__typename":"Iframe","mediaResource":{"__ref":"MediaResource:f7be36498a19c0a0b265758994f171a6"}},"mixtapeMetadata":null},"Paragraph:44abc7cff577_113":{"__typename":"Paragraph","id":"44abc7cff577_113","name":"f262","type":"P","href":null,"layout":null,"metadata":null,"text":"I trained my model and the PyTorch tutorial model on the same dataset used in the PyTorch tutorial (which is the same dataset of English to French translations mentioned above). To preprocess the data, the trim was set to 10 and the eng_prefixes filters that PyTorch used was set to TRUE. With these restrictions, the dataset was cut to a rather small set of 10,853 sentence pairs. The PyTorch tutorial broke one of the fundamental rules of machine learning and didn’t use a test set (not good practice!). So, just for comparison purposes, I kept all of these sentence pairs in my train set and didn’t use a test set (i.e. perc_train_set = 1.0). 
However, I’d recommend that you always use a test set when training any sort of machine learning model.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_114":{"__typename":"Paragraph","id":"44abc7cff577_114","name":"94b5","type":"P","href":null,"layout":null,"metadata":null,"text":"A comparison of the hyperparameters I chose for my model vs. the hyperparameters in the PyTorch tutorial model is shown in Table 1. The graph below in Figure 19 depicts the results of training for 40 minutes on an NVIDIA GeForce GTX 1080 (a bit older GPU, you can actually achieve superior results using Google Colab).","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*ldHhAVYcbeRxZcQ4u3HPWQ.png":{"__typename":"ImageMetadata","id":"1*ldHhAVYcbeRxZcQ4u3HPWQ.png","originalHeight":184,"originalWidth":446,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_115":{"__typename":"Paragraph","id":"44abc7cff577_115","name":"a9c7","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*ldHhAVYcbeRxZcQ4u3HPWQ.png"},"text":"Table 1: Hyperparameters comparison","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*gWMmg6860K7Q8uXhOKAk4A.png":{"__typename":"ImageMetadata","id":"1*gWMmg6860K7Q8uXhOKAk4A.png","originalHeight":1125,"originalWidth":1260,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_116":{"__typename":"Paragraph","id":"44abc7cff577_116","name":"6de0","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*gWMmg6860K7Q8uXhOKAk4A.png"},"text":"Figure 19: Loss over 40 minute training period for this tutorial model (My Model) vs PyTorch Tutorial 
Model","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_117":{"__typename":"Paragraph","id":"44abc7cff577_117","name":"2429","type":"P","href":null,"layout":null,"metadata":null,"text":"Since this dataset has no test set, I evaluated the model on a few sentences from the train set.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*rUvPuDqINjJPUwnpe8uyEw.png":{"__typename":"ImageMetadata","id":"1*rUvPuDqINjJPUwnpe8uyEw.png","originalHeight":339,"originalWidth":445,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_118":{"__typename":"Paragraph","id":"44abc7cff577_118","name":"4137","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*rUvPuDqINjJPUwnpe8uyEw.png"},"text":"Figure 20: Predicted translation of PyTorch tutorial model (Blue) vs. My Model (Orange)","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_119":{"__typename":"Paragraph","id":"44abc7cff577_119","name":"d0f8","type":"P","href":null,"layout":null,"metadata":null,"text":"From these results, we can see that the model in this tutorial can create a more effective translation model in the same amount of training time. However, when we try to use this model to translate sentences outside of the train set, it immediately breaks down. 
We can see this in the model’s attempted translation of the following sentence which was NOT in the dataset.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*XpJeIJvIWRivzTtG5ZHxdw.png":{"__typename":"ImageMetadata","id":"1*XpJeIJvIWRivzTtG5ZHxdw.png","originalHeight":106,"originalWidth":760,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_120":{"__typename":"Paragraph","id":"44abc7cff577_120","name":"884d","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*XpJeIJvIWRivzTtG5ZHxdw.png"},"text":"Figure 21: Failed translation on sentence outside the dataset.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_121":{"__typename":"Paragraph","id":"44abc7cff577_121","name":"6fe7","type":"P","href":null,"layout":null,"metadata":null,"text":"This failure of the model is largely due to the fact that it was trained on such a small dataset. Furthermore, we were not aware of this problem because we had no test set to check the model’s ability to translate on sentences outside of the train set. To combat this issue, I retrained my model on the same dataset, this time with a trim=40 and without the eng_prefixes filter. Even when I set aside 10% of the sentence pairs for a test set, the train set was still over 10x the size of the one used to train the model before (122,251 train pairs). I also modified the hidden size of the model from 440 to 1080 and decreased the batch size from 32 to 10. 
Finally, I changed the initial learning rate to 0.5 and installed a learning rate schedule which decreased the learning rate by a factor of five after every five epochs.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_122":{"__typename":"Paragraph","id":"44abc7cff577_122","name":"2f4c","type":"P","href":null,"layout":null,"metadata":null,"text":"With this larger dataset and updated hyperparameters, the model was trained on the same GPU. The loss on the train and test set during training, as well as the translation of the same sentence it failed on above, are shown below.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*6v4ywPVnzs2Km4HkAPlCZw.png":{"__typename":"ImageMetadata","id":"1*6v4ywPVnzs2Km4HkAPlCZw.png","originalHeight":398,"originalWidth":496,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_123":{"__typename":"Paragraph","id":"44abc7cff577_123","name":"c047","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*6v4ywPVnzs2Km4HkAPlCZw.png"},"text":"Figure 22: Train and Test loss vs. 
time","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*rAAWxzXj8zRfE6cfxrEzOQ.png":{"__typename":"ImageMetadata","id":"1*rAAWxzXj8zRfE6cfxrEzOQ.png","originalHeight":111,"originalWidth":802,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:44abc7cff577_124":{"__typename":"Paragraph","id":"44abc7cff577_124","name":"d109","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*rAAWxzXj8zRfE6cfxrEzOQ.png"},"text":"Figure 23: Improved (yet still imperfect) translation of sentence outside of the dataset.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_125":{"__typename":"Paragraph","id":"44abc7cff577_125","name":"7d07","type":"P","href":null,"layout":null,"metadata":null,"text":"As you can see, the translation of this sentence is significantly improved. However, in order to achieve a perfect translation, we would probably need to increase the size of the dataset by even more.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_126":{"__typename":"Paragraph","id":"44abc7cff577_126","name":"b6d8","type":"H3","href":null,"layout":null,"metadata":null,"text":"Conclusion","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_127":{"__typename":"Paragraph","id":"44abc7cff577_127","name":"c3d6","type":"P","href":null,"layout":null,"metadata":null,"text":"While this tutorial provides an introduction to NMT using the Encoder Decoder structure, the implemented attention mechanism is rather basic. 
If you are interested in creating a more state-of-the-art model I’d recommend looking into the concept of local attention and attempting to implement this more advanced type of attention within the Decoder portion of the model.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:44abc7cff577_128":{"__typename":"Paragraph","id":"44abc7cff577_128","name":"5141","type":"P","href":null,"layout":null,"metadata":null,"text":"Otherwise, I hope you enjoyed the tutorial and learned a lot! The basis of the material covered in this post was from my thesis at Loyola Marymount University. If you want to take a look at the PPT presentation I used to share these ideas (which includes the majority of the images in this article) you can find that here. You can also read the Thesis paper I wrote on the topic, which explains the math behind NMT in much greater depth, here. And lastly, the full Jupyter notebook for this project can be found here or alternatively a Python script version can be found 
here.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":317,"end":321,"href":"https:\u002F\u002Fgithub.com\u002Fqlanners\u002Fnmt_tutorial\u002Fblob\u002Fmaster\u002Fthesis_presentation.pptx","anchorType":"LINK","userId":null,"linkMetadata":null},{"__typename":"Markup","type":"A","start":438,"end":442,"href":"https:\u002F\u002Fgithub.com\u002Fqlanners\u002Fnmt_tutorial\u002Fblob\u002Fmaster\u002Fquinn_thesis_final.pdf","anchorType":"LINK","userId":null,"linkMetadata":null},{"__typename":"Markup","type":"A","start":512,"end":516,"href":"https:\u002F\u002Fgithub.com\u002Fqlanners\u002Fnmt_tutorial\u002Fblob\u002Fmaster\u002Fnmt_tutorial.ipynb","anchorType":"LINK","userId":null,"linkMetadata":null},{"__typename":"Markup","type":"A","start":571,"end":575,"href":"https:\u002F\u002Fgithub.com\u002Fqlanners\u002Fnmt_tutorial\u002Fblob\u002Fmaster\u002Fnmt_tutorial.py","anchorType":"LINK","userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"CollectionViewerEdge:collectionId:7f60cf5620c9-viewerId:lo_176642cbbfab":{"__typename":"CollectionViewerEdge","id":"collectionId:7f60cf5620c9-viewerId:lo_176642cbbfab","isEditor":false,"isMuting":false},"ImageMetadata:1*cFFKn8rFH4ZndmaYeAs6iQ.png":{"__typename":"ImageMetadata","id":"1*cFFKn8rFH4ZndmaYeAs6iQ.png","originalWidth":2381,"originalHeight":743},"PostViewerEdge:postId:15ecf6b0b-viewerId:lo_176642cbbfab":{"__typename":"PostViewerEdge","shouldIndexPostForExternalSearch":true,"id":"postId:15ecf6b0b-viewerId:lo_176642cbbfab"},"Tag:machine-learning":{"__typename":"Tag","id":"machine-learning","displayTitle":"Machine Learning","normalizedTagSlug":"machine-learning"},"Tag:recurrent-neural-network":{"__typename":"Tag","id":"recurrent-neural-network","displayTitle":"Recurrent Neural Network","normalizedTagSlug":"recurrent-neural-network"},"Tag:machine-translation":{"__typename":"Tag","id":"machine-translation","displayTitle":"Machine 
Translation","normalizedTagSlug":"machine-translation"},"Tag:deep-learning":{"__typename":"Tag","id":"deep-learning","displayTitle":"Deep Learning","normalizedTagSlug":"deep-learning"},"Tag:pytorch":{"__typename":"Tag","id":"pytorch","displayTitle":"Pytorch","normalizedTagSlug":"pytorch"},"Post:15ecf6b0b":{"__typename":"Post","id":"15ecf6b0b","collection":{"__ref":"Collection:7f60cf5620c9"},"content({\"postMeteringOptions\":{}})":{"__typename":"PostContent","isLockedPreviewOnly":false,"bodyModel":{"__typename":"RichText","sections":[{"__typename":"Section","name":"1817","startIndex":0,"textLayout":null,"imageLayout":null,"backgroundImage":null,"videoLayout":null,"backgroundVideo":null},{"__typename":"Section","name":"4fde","startIndex":7,"textLayout":null,"imageLayout":null,"backgroundImage":null,"videoLayout":null,"backgroundVideo":null},{"__typename":"Section","name":"b9fc","startIndex":49,"textLayout":null,"imageLayout":null,"backgroundImage":null,"videoLayout":null,"backgroundVideo":null},{"__typename":"Section","name":"7331","startIndex":126,"textLayout":null,"imageLayout":null,"backgroundImage":null,"videoLayout":null,"backgroundVideo":null}],"paragraphs":[{"__ref":"Paragraph:44abc7cff577_0"},{"__ref":"Paragraph:44abc7cff577_1"},{"__ref":"Paragraph:44abc7cff577_2"},{"__ref":"Paragraph:44abc7cff577_3"},{"__ref":"Paragraph:44abc7cff577_4"},{"__ref":"Paragraph:44abc7cff577_5"},{"__ref":"Paragraph:44abc7cff577_6"},{"__ref":"Paragraph:44abc7cff577_7"},{"__ref":"Paragraph:44abc7cff577_8"},{"__ref":"Paragraph:44abc7cff577_9"},{"__ref":"Paragraph:44abc7cff577_10"},{"__ref":"Paragraph:44abc7cff577_11"},{"__ref":"Paragraph:44abc7cff577_12"},{"__ref":"Paragraph:44abc7cff577_13"},{"__ref":"Paragraph:44abc7cff577_14"},{"__ref":"Paragraph:44abc7cff577_15"},{"__ref":"Paragraph:44abc7cff577_16"},{"__ref":"Paragraph:44abc7cff577_17"},{"__ref":"Paragraph:44abc7cff577_18"},{"__ref":"Paragraph:44abc7cff577_19"},{"__ref":"Paragraph:44abc7cff577_20"},{"__ref":"Paragraph:44abc7cff57
7_21"},{"__ref":"Paragraph:44abc7cff577_22"},{"__ref":"Paragraph:44abc7cff577_23"},{"__ref":"Paragraph:44abc7cff577_24"},{"__ref":"Paragraph:44abc7cff577_25"},{"__ref":"Paragraph:44abc7cff577_26"},{"__ref":"Paragraph:44abc7cff577_27"},{"__ref":"Paragraph:44abc7cff577_28"},{"__ref":"Paragraph:44abc7cff577_29"},{"__ref":"Paragraph:44abc7cff577_30"},{"__ref":"Paragraph:44abc7cff577_31"},{"__ref":"Paragraph:44abc7cff577_32"},{"__ref":"Paragraph:44abc7cff577_33"},{"__ref":"Paragraph:44abc7cff577_34"},{"__ref":"Paragraph:44abc7cff577_35"},{"__ref":"Paragraph:44abc7cff577_36"},{"__ref":"Paragraph:44abc7cff577_37"},{"__ref":"Paragraph:44abc7cff577_38"},{"__ref":"Paragraph:44abc7cff577_39"},{"__ref":"Paragraph:44abc7cff577_40"},{"__ref":"Paragraph:44abc7cff577_41"},{"__ref":"Paragraph:44abc7cff577_42"},{"__ref":"Paragraph:44abc7cff577_43"},{"__ref":"Paragraph:44abc7cff577_44"},{"__ref":"Paragraph:44abc7cff577_45"},{"__ref":"Paragraph:44abc7cff577_46"},{"__ref":"Paragraph:44abc7cff577_47"},{"__ref":"Paragraph:44abc7cff577_48"},{"__ref":"Paragraph:44abc7cff577_49"},{"__ref":"Paragraph:44abc7cff577_50"},{"__ref":"Paragraph:44abc7cff577_51"},{"__ref":"Paragraph:44abc7cff577_52"},{"__ref":"Paragraph:44abc7cff577_53"},{"__ref":"Paragraph:44abc7cff577_54"},{"__ref":"Paragraph:44abc7cff577_55"},{"__ref":"Paragraph:44abc7cff577_56"},{"__ref":"Paragraph:44abc7cff577_57"},{"__ref":"Paragraph:44abc7cff577_58"},{"__ref":"Paragraph:44abc7cff577_59"},{"__ref":"Paragraph:44abc7cff577_60"},{"__ref":"Paragraph:44abc7cff577_61"},{"__ref":"Paragraph:44abc7cff577_62"},{"__ref":"Paragraph:44abc7cff577_63"},{"__ref":"Paragraph:44abc7cff577_64"},{"__ref":"Paragraph:44abc7cff577_65"},{"__ref":"Paragraph:44abc7cff577_66"},{"__ref":"Paragraph:44abc7cff577_67"},{"__ref":"Paragraph:44abc7cff577_68"},{"__ref":"Paragraph:44abc7cff577_69"},{"__ref":"Paragraph:44abc7cff577_70"},{"__ref":"Paragraph:44abc7cff577_71"},{"__ref":"Paragraph:44abc7cff577_72"},{"__ref":"Paragraph:44abc7cff577_73"},{"__ref":"Paragra
ph:44abc7cff577_74"},{"__ref":"Paragraph:44abc7cff577_75"},{"__ref":"Paragraph:44abc7cff577_76"},{"__ref":"Paragraph:44abc7cff577_77"},{"__ref":"Paragraph:44abc7cff577_78"},{"__ref":"Paragraph:44abc7cff577_79"},{"__ref":"Paragraph:44abc7cff577_80"},{"__ref":"Paragraph:44abc7cff577_81"},{"__ref":"Paragraph:44abc7cff577_82"},{"__ref":"Paragraph:44abc7cff577_83"},{"__ref":"Paragraph:44abc7cff577_84"},{"__ref":"Paragraph:44abc7cff577_85"},{"__ref":"Paragraph:44abc7cff577_86"},{"__ref":"Paragraph:44abc7cff577_87"},{"__ref":"Paragraph:44abc7cff577_88"},{"__ref":"Paragraph:44abc7cff577_89"},{"__ref":"Paragraph:44abc7cff577_90"},{"__ref":"Paragraph:44abc7cff577_91"},{"__ref":"Paragraph:44abc7cff577_92"},{"__ref":"Paragraph:44abc7cff577_93"},{"__ref":"Paragraph:44abc7cff577_94"},{"__ref":"Paragraph:44abc7cff577_95"},{"__ref":"Paragraph:44abc7cff577_96"},{"__ref":"Paragraph:44abc7cff577_97"},{"__ref":"Paragraph:44abc7cff577_98"},{"__ref":"Paragraph:44abc7cff577_99"},{"__ref":"Paragraph:44abc7cff577_100"},{"__ref":"Paragraph:44abc7cff577_101"},{"__ref":"Paragraph:44abc7cff577_102"},{"__ref":"Paragraph:44abc7cff577_103"},{"__ref":"Paragraph:44abc7cff577_104"},{"__ref":"Paragraph:44abc7cff577_105"},{"__ref":"Paragraph:44abc7cff577_106"},{"__ref":"Paragraph:44abc7cff577_107"},{"__ref":"Paragraph:44abc7cff577_108"},{"__ref":"Paragraph:44abc7cff577_109"},{"__ref":"Paragraph:44abc7cff577_110"},{"__ref":"Paragraph:44abc7cff577_111"},{"__ref":"Paragraph:44abc7cff577_112"},{"__ref":"Paragraph:44abc7cff577_113"},{"__ref":"Paragraph:44abc7cff577_114"},{"__ref":"Paragraph:44abc7cff577_115"},{"__ref":"Paragraph:44abc7cff577_116"},{"__ref":"Paragraph:44abc7cff577_117"},{"__ref":"Paragraph:44abc7cff577_118"},{"__ref":"Paragraph:44abc7cff577_119"},{"__ref":"Paragraph:44abc7cff577_120"},{"__ref":"Paragraph:44abc7cff577_121"},{"__ref":"Paragraph:44abc7cff577_122"},{"__ref":"Paragraph:44abc7cff577_123"},{"__ref":"Paragraph:44abc7cff577_124"},{"__ref":"Paragraph:44abc7cff577_125"},{"__ref":"Parag
raph:44abc7cff577_126"},{"__ref":"Paragraph:44abc7cff577_127"},{"__ref":"Paragraph:44abc7cff577_128"}]},"validatedShareKey":"","shareKeyCreator":null},"creator":{"__ref":"User:8a2f7df48b90"},"inResponseToEntityType":null,"isLocked":false,"isMarkedPaywallOnly":false,"lockedSource":"LOCKED_POST_SOURCE_NONE","mediumUrl":"https:\u002F\u002Ftowardsdatascience.com\u002Fneural-machine-translation-15ecf6b0b","primaryTopic":{"__ref":"Topic:1eca0103fff3"},"topics":[{"__typename":"Topic","slug":"machine-learning"},{"__typename":"Topic","slug":"data-science"},{"__typename":"Topic","slug":"programming"}],"isPublished":true,"latestPublishedVersion":"44abc7cff577","visibility":"PUBLIC","postResponses":{"__typename":"PostResponses","count":5},"clapCount":386,"allowResponses":true,"isLimitedState":false,"title":"Neural Machine Translation","isSeries":false,"sequence":null,"uniqueSlug":"neural-machine-translation-15ecf6b0b","socialTitle":"","socialDek":"","canonicalUrl":"","metaDescription":"A guide to Neural Machine Translation using an Encoder Decoder structure with attention. 
Includes a detailed tutorial using PyTorch in Google Colaboratory.","latestPublishedAt":1559922492033,"readingTime":20.182075471698113,"previewContent":{"__typename":"PreviewContent","subtitle":"Machine Translation using Recurrent Neural Networks (includes tutorial in PyTorch)"},"previewImage":{"__ref":"ImageMetadata:1*H441VINdbjxItCdtgb-1Xw.jpeg"},"isShortform":false,"seoTitle":"","firstPublishedAt":1559602324362,"updatedAt":1648671923343,"shortformType":"SHORTFORM_TYPE_LINK","seoDescription":"","viewerEdge":{"__ref":"PostViewerEdge:postId:15ecf6b0b-viewerId:lo_176642cbbfab"},"isSuspended":false,"license":"ALL_RIGHTS_RESERVED","tags":[{"__ref":"Tag:machine-learning"},{"__ref":"Tag:recurrent-neural-network"},{"__ref":"Tag:machine-translation"},{"__ref":"Tag:deep-learning"},{"__ref":"Tag:pytorch"}],"isNewsletter":false,"statusForCollection":"APPROVED","pendingCollection":null,"detectedLanguage":"en","wordCount":4805,"layerCake":2,"responsesLocked":false}}</script><script src="https://cdn-client.medium.com/lite/static/js/manifest.aa9242f7.js"></script><script src="https://cdn-client.medium.com/lite/static/js/9865.1496d74a.js"></script><script src="https://cdn-client.medium.com/lite/static/js/main.e556b4ac.js"></script><script src="https://cdn-client.medium.com/lite/static/js/instrumentation.d9108df7.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/reporting.ff22a7a5.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/9120.5df29668.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/5049.d1ead72d.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/4810.6318add7.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/6618.db187378.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/2707.b0942613.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/9977.5b3eb23a.chunk.js"></script> <script 
src="https://cdn-client.medium.com/lite/static/js/8599.1ab63137.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/5250.9f9e01d2.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/5787.e66a3a4d.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/2648.26563adf.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/8393.826a25fb.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/3104.c3413b66.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/3735.afb7e926.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/5642.8ad8a900.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/6546.cd03f950.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/6834.08de95de.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/7346.72622eb9.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/2420.2a5e2d95.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/839.ca7937c2.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/7975.d195c6f1.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/2106.21ff89d3.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/7394.094844de.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/2961.00a48598.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/8204.c4082863.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/4391.59acaed3.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/PostPage.MainContent.1387c5dc.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/8414.6565ad5f.chunk.js"></script> <script 
src="https://cdn-client.medium.com/lite/static/js/3974.8d3e0217.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/2527.a0afad8a.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/PostResponsesContent.36c2ecf4.chunk.js"></script><script>window.main();</script><script>(function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'8e934a91586289a4',t:'MTczMjcyMjgwOS4wMDAwMDA='};var a=document.createElement('script');a.nonce='';a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();</script></body></html>