fill="currentColor" fill-rule="evenodd" d="M4.092 11.06a6.95 6.95 0 1 1 13.9 0 6.95 6.95 0 0 1-13.9 0m6.95-8.05a8.05 8.05 0 1 0 5.13 14.26l3.75 3.75a.56.56 0 1 0 .79-.79l-3.73-3.73A8.05 8.05 0 0 0 11.042 3z" clip-rule="evenodd"></path></svg></div></a></div></div><div class="fi h k j"><div class="ab q"><p class="bf b dx dy dz ea eb ec ed ee ef eg du"><span><button class="bf b dx dy eh dz ea ei eb ec ej ek ee el em eg eo ep eq er es et eu ev ew ex ey ez fa fb fc fd bm fe ff" data-testid="headerSignUpButton">Sign up</button></span></p><div class="ax l"><p class="bf b dx dy dz ea eb ec ed ee ef eg du"><span><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="headerSignInButton" href="https://medium.com/m/signin?operation=login&redirect=https%3A%2F%2Frahuld3eora.medium.com%2Fdeep-learning-for-action-recognition-from-basics-to-efficiency-advancements-12d803da3854&source=post_page---top_nav_layout_nav-----------------------global_nav-----------" rel="noopener follow">Sign in</a></span></p></div></div></div><div class="l" aria-hidden="false"><button class="ay fn am ab q ao fo fp fq" aria-label="user options menu" data-testid="headerUserIcon"><div class="l fj"><img alt="" class="l fd by bz ca cx" src="https://miro.medium.com/v2/resize:fill:64:64/1*dmbNkD5D-u45r44go_cf0g.png" width="32" height="32" loading="lazy" role="presentation"/><div class="fr by l bz ca fs n ay ft"></div></div></button></div></div></div><div class="l"><div class="fu fv fw fx fy l"><div class="ab cb"><div class="ci bh fz ga gb gc"></div></div><article><div class="l"><div class="l"><span class="l"></span><section><div><div class="fs gi gj gk gl gm"></div><div class="gn go gp gq gr"><div class="ab cb"><div class="ci bh fz ga gb gc"><div><h1 id="4f80" class="pw-post-title gs gt gu bf gv gw gx gy gz ha hb hc hd he hf hg hh hi hj hk hl hm hn ho hp hq hr hs ht hu bk" data-testid="storyTitle">Deep Learning for Action Recognition: From Basics to Efficiency Advancements</h1><div><div class="speechify-ignore ab cp"><div class="speechify-ignore bh l"><div class="hv hw hx hy hz ab"><div><div class="ab ia"><div><div class="bm" aria-hidden="false"><a rel="noopener follow" href="/?source=post_page---byline--12d803da3854--------------------------------"><div class="l ib ic by id ie"><div class="l fj"><img alt="Rahul Deora" class="l fd by dd de cx" src="https://miro.medium.com/v2/resize:fill:88:88/1*yiQbwaG6UYBBeLZeDgIVLg.jpeg" width="44" height="44" loading="lazy" data-testid="authorPhoto"/><div class="if by l dd de fs n ig ft"></div></div></div></a></div></div></div></div><div class="bn bh l"><div class="ab"><div style="flex:1"><span class="bf b bg z bk"><div class="ih ab q"><div class="ab q ii"><div class="ab q"><div><div class="bm" aria-hidden="false"><p class="bf b ij ik bk"><a class="af ag ah ai aj ak al am an ao ap aq ar il" data-testid="authorName" rel="noopener follow" href="/?source=post_page---byline--12d803da3854--------------------------------">Rahul Deora</a></p></div></div></div><span class="im in" aria-hidden="true"><span class="bf b bg z du">·</span></span><p class="bf b ij ik du"><span><a class="io ip ah ai aj ak al am an ao ap aq ar ex iq ir" 
href="https://medium.com/m/signin?actionUrl=https%3A%2F%2Fmedium.com%2F_%2Fsubscribe%2Fuser%2Fc0b8e0e6e9b0&operation=register&redirect=https%3A%2F%2Frahuld3eora.medium.com%2Fdeep-learning-for-action-recognition-from-basics-to-efficiency-advancements-12d803da3854&user=Rahul+Deora&userId=c0b8e0e6e9b0&source=post_page-c0b8e0e6e9b0--byline--12d803da3854---------------------post_header-----------" rel="noopener follow">Follow</a></span></p></div></div></span></div></div><div class="l is"><span class="bf b bg z du"><div class="ab cn it iu iv"><span class="bf b bg z du"><div class="ab ae"><span data-testid="storyReadTime">10 min read</span><div class="iw ix l" aria-hidden="true"><span class="l" aria-hidden="true"><span class="bf b bg z du">·</span></span></div><span data-testid="storyPublishDate">Jan 23, 2024</span></div></span></div></span></div></div></div><div class="ab cp iy iz ja jb jc jd je jf jg jh ji jj jk jl jm jn"><div class="h k w fg fh q"><div class="kd l"><div class="ab q ke kf"><div class="pw-multi-vote-icon fj kg kh ki kj"><span><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="headerClapButton" href="https://medium.com/m/signin?actionUrl=https%3A%2F%2Fmedium.com%2F_%2Fvote%2Fp%2F12d803da3854&operation=register&redirect=https%3A%2F%2Frahuld3eora.medium.com%2Fdeep-learning-for-action-recognition-from-basics-to-efficiency-advancements-12d803da3854&user=Rahul+Deora&userId=c0b8e0e6e9b0&source=---header_actions--12d803da3854---------------------clap_footer-----------" rel="noopener follow"><div><div class="bm" aria-hidden="false"><div class="kk ao kl km kn ko am kp kq kr kj"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" aria-label="clap"><path fill-rule="evenodd" d="M11.37.828 12 3.282l.63-2.454zM13.916 3.953l1.523-2.112-1.184-.39zM8.589 1.84l1.522 2.112-.337-2.501zM18.523 18.92c-.86.86-1.75 1.246-2.62 1.33a6 6 0 0 0 .407-.372c2.388-2.389 2.86-4.951 1.399-7.623l-.912-1.603-.79-1.672c-.26-.56-.194-.98.203-1.288a.7.7 0 0 1 .546-.132c.283.046.546.231.728.5l2.363 4.157c.976 1.624 1.141 4.237-1.324 6.702m-10.999-.438L3.37 14.328a.828.828 0 0 1 .585-1.408.83.83 0 0 1 .585.242l2.158 2.157a.365.365 0 0 0 .516-.516l-2.157-2.158-1.449-1.449a.826.826 0 0 1 1.167-1.17l3.438 3.44a.363.363 0 0 0 .516 0 .364.364 0 0 0 0-.516L5.293 9.513l-.97-.97a.826.826 0 0 1 0-1.166.84.84 0 0 1 1.167 0l.97.968 3.437 3.436a.36.36 0 0 0 .517 0 .366.366 0 0 0 0-.516L6.977 7.83a.82.82 0 0 1-.241-.584.82.82 0 0 1 .824-.826c.219 0 .43.087.584.242l5.787 5.787a.366.366 0 0 0 .587-.415l-1.117-2.363c-.26-.56-.194-.98.204-1.289a.7.7 0 0 1 .546-.132c.283.046.545.232.727.501l2.193 3.86c1.302 2.38.883 4.59-1.277 6.75-1.156 1.156-2.602 1.627-4.19 1.367-1.418-.236-2.866-1.033-4.079-2.246M10.75 5.971l2.12 2.12c-.41.502-.465 1.17-.128 1.89l.22.465-3.523-3.523a.8.8 0 0 1-.097-.368c0-.22.086-.428.241-.584a.847.847 0 0 1 1.167 0m7.355 1.705c-.31-.461-.746-.758-1.23-.837a1.44 1.44 0 0 0-1.11.275c-.312.24-.505.543-.59.881a1.74 1.74 0 0 0-.906-.465 1.47 1.47 0 0 0-.82.106l-2.182-2.182a1.56 1.56 0 0 0-2.2 0 1.54 1.54 0 0 0-.396.701 1.56 1.56 0 0 0-2.21-.01 1.55 1.55 0 0 0-.416.753c-.624-.624-1.649-.624-2.237-.037a1.557 1.557 0 0 0 0 2.2c-.239.1-.501.238-.715.453a1.56 1.56 0 0 0 0 2.2l.516.515a1.556 1.556 0 0 0-.753 2.615L7.01 19c1.32 1.319 2.909 2.189 4.475 2.449q.482.08.971.08c.85 0 1.653-.198 2.393-.579.231.033.46.054.686.054 1.266 0 2.457-.52 3.505-1.567 2.763-2.763 2.552-5.734 1.439-7.586z" clip-rule="evenodd"></path></svg></div></div></div></a></span></div><div 
class="pw-multi-vote-count l ks kt ku kv kw kx ky"><p class="bf b dv z du"><span class="kz">--</span></p></div></div></div><div><div class="bm" aria-hidden="false"><button class="ao kk lc ld ab q fk le lf" aria-label="responses"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" class="lb"><path d="M18.006 16.803c1.533-1.456 2.234-3.325 2.234-5.321C20.24 7.357 16.709 4 12.191 4S4 7.357 4 11.482c0 4.126 3.674 7.482 8.191 7.482.817 0 1.622-.111 2.393-.327.231.2.48.391.744.559 1.06.693 2.203 1.044 3.399 1.044.224-.008.4-.112.486-.287a.49.49 0 0 0-.042-.518c-.495-.67-.845-1.364-1.04-2.057a4 4 0 0 1-.125-.598zm-3.122 1.055-.067-.223-.315.096a8 8 0 0 1-2.311.338c-4.023 0-7.292-2.955-7.292-6.587 0-3.633 3.269-6.588 7.292-6.588 4.014 0 7.112 2.958 7.112 6.593 0 1.794-.608 3.469-2.027 4.72l-.195.168v.255c0 .056 0 .151.016.295.025.231.081.478.154.733.154.558.398 1.117.722 1.659a5.3 5.3 0 0 1-2.165-.845c-.276-.176-.714-.383-.941-.59z"></path></svg><p class="bf b dv z du"><span class="pw-responses-count la lb">1</span></p></button></div></div></div><div class="ab q jo jp jq jr js jt ju jv jw jx jy jz ka kb kc"><div class="lg k j i d"></div><div class="h k"><div><div class="bm" aria-hidden="false"><span><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="headerBookmarkButton" href="https://medium.com/m/signin?actionUrl=https%3A%2F%2Fmedium.com%2F_%2Fbookmark%2Fp%2F12d803da3854&operation=register&redirect=https%3A%2F%2Frahuld3eora.medium.com%2Fdeep-learning-for-action-recognition-from-basics-to-efficiency-advancements-12d803da3854&source=---header_actions--12d803da3854---------------------bookmark_footer-----------" rel="noopener follow"><svg xmlns="http://www.w3.org/2000/svg" width="25" height="25" fill="none" viewBox="0 0 25 25" class="du lh" aria-label="Add to list bookmark button"><path fill="currentColor" d="M18 2.5a.5.5 0 0 1 1 0V5h2.5a.5.5 0 0 1 0 1H19v2.5a.5.5 0 1 1-1 0V6h-2.5a.5.5 0 0 1 0-1H18zM7 7a1 1 0 0 1 1-1h3.5a.5.5 0 0 0 0-1H8a2 2 0 0 0-2 2v14a.5.5 0 0 0 .805.396L12.5 17l5.695 4.396A.5.5 0 0 0 19 21v-8.5a.5.5 0 0 0-1 0v7.485l-5.195-4.012a.5.5 0 0 0-.61 0L7 19.985z"></path></svg></a></span></div></div></div><div class="fd li cn"><div class="l ae"><div class="ab cb"><div class="lj lk ll lm ln lo ci bh"><div class="ab"><div class="bm bh" aria-hidden="false"><div><div class="bm" aria-hidden="false"><button aria-label="Listen" data-testid="audioPlayButton" class="af fk ah ai aj ak al lp an ao ap ex lq lr lf ls lt lu lv lw s lx ly lz ma mb mc md u me mf mg"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" fill="none" viewBox="0 0 24 24"><path fill="currentColor" fill-rule="evenodd" d="M3 12a9 9 0 1 1 18 0 9 9 0 0 1-18 0m9-10C6.477 2 2 6.477 2 12s4.477 10 10 10 10-4.477 10-10S17.523 2 12 2m3.376 10.416-4.599 3.066a.5.5 0 0 1-.777-.416V8.934a.5.5 0 0 1 .777-.416l4.599 3.066a.5.5 0 0 1 0 .832" clip-rule="evenodd"></path></svg><div class="j i d"><p class="bf b bg z du">Listen</p></div></button></div></div></div></div></div></div></div></div><div class="bm" aria-hidden="false" aria-describedby="postFooterSocialMenu" aria-labelledby="postFooterSocialMenu"><div><div class="bm" aria-hidden="false"><button aria-controls="postFooterSocialMenu" aria-expanded="false" aria-label="Share Post" data-testid="headerSocialShareButton" class="af fk ah ai aj ak al lp an ao ap ex lq lr lf ls lt lu lv lw s lx ly lz ma mb mc md u me mf mg"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" fill="none" viewBox="0 0 24 24"><path 
fill="currentColor" fill-rule="evenodd" d="M15.218 4.931a.4.4 0 0 1-.118.132l.012.006a.45.45 0 0 1-.292.074.5.5 0 0 1-.3-.13l-2.02-2.02v7.07c0 .28-.23.5-.5.5s-.5-.22-.5-.5v-7.04l-2 2a.45.45 0 0 1-.57.04h-.02a.4.4 0 0 1-.16-.3.4.4 0 0 1 .1-.32l2.8-2.8a.5.5 0 0 1 .7 0l2.8 2.79a.42.42 0 0 1 .068.498m-.106.138.008.004v-.01zM16 7.063h1.5a2 2 0 0 1 2 2v10a2 2 0 0 1-2 2h-11c-1.1 0-2-.9-2-2v-10a2 2 0 0 1 2-2H8a.5.5 0 0 1 .35.15.5.5 0 0 1 .15.35.5.5 0 0 1-.15.35.5.5 0 0 1-.35.15H6.4c-.5 0-.9.4-.9.9v10.2a.9.9 0 0 0 .9.9h11.2c.5 0 .9-.4.9-.9v-10.2c0-.5-.4-.9-.9-.9H16a.5.5 0 0 1 0-1" clip-rule="evenodd"></path></svg><div class="j i d"><p class="bf b bg z du">Share</p></div></button></div></div></div></div></div></div></div></div></div><figure class="mk ml mm mn mo mp mh mi paragraph-image"><div role="button" tabindex="0" class="mq mr fj ms bh mt"><div class="mh mi mj"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*aNj3n_P-oyNwuOSCZxmFiQ.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*aNj3n_P-oyNwuOSCZxmFiQ.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*aNj3n_P-oyNwuOSCZxmFiQ.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*aNj3n_P-oyNwuOSCZxmFiQ.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*aNj3n_P-oyNwuOSCZxmFiQ.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*aNj3n_P-oyNwuOSCZxmFiQ.png 1100w, https://miro.medium.com/v2/resize:fit:1400/format:webp/1*aNj3n_P-oyNwuOSCZxmFiQ.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*aNj3n_P-oyNwuOSCZxmFiQ.png 640w, https://miro.medium.com/v2/resize:fit:720/1*aNj3n_P-oyNwuOSCZxmFiQ.png 720w, https://miro.medium.com/v2/resize:fit:750/1*aNj3n_P-oyNwuOSCZxmFiQ.png 750w, https://miro.medium.com/v2/resize:fit:786/1*aNj3n_P-oyNwuOSCZxmFiQ.png 786w, https://miro.medium.com/v2/resize:fit:828/1*aNj3n_P-oyNwuOSCZxmFiQ.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*aNj3n_P-oyNwuOSCZxmFiQ.png 1100w, https://miro.medium.com/v2/resize:fit:1400/1*aNj3n_P-oyNwuOSCZxmFiQ.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px"/><img alt="" class="bh lo mu c" width="700" height="528" loading="eager" role="presentation"/></picture></div></div><figcaption class="mv ff mw mh mi mx my bf b bg z du">The many actions in a video</figcaption></figure><p id="8554" class="pw-post-body-paragraph mz na gu nb b nc nd ne nf ng nh ni nj nk nl nm nn no np nq nr ns nt nu nv nw gn bk">Action recognition is an important task in the field of computer vision that entails classifying human 
actions depicted in video frames. Think of it as the video counterpart of image classification. Action recognition is to videos what classification is to images. Instead of identifying objects in static 2D images, action recognition involves discerning actions within dynamic video clips, where each frame is essentially a 2D image connected to other 2D images in a sequence.</p><p id="16d5" class="pw-post-body-paragraph mz na gu nb b nc nd ne nf ng nh ni nj nk nl nm nn no np nq nr ns nt nu nv nw gn bk">Action Recognition is more challenging than 2D classification due to the following reasons:</p><ul class=""><li id="08f1" class="mz na gu nb b nc nd ne nf ng nh ni nj nk nl nm nn no np nq nr ns nt nu nv nw nx ny nz bk">Densely Packed Actions: Videos often present scenarios where numerous actions unfold concurrently or in quick succession</li><li id="fe12" class="mz na gu nb b nc oa ne nf ng ob ni nj nk oc nm nn no od nq nr ns oe nu nv nw nx ny nz bk">Long-Range Processing: Actions may extend over extended intervals, requiring long-range processing to capture the nuances and transitions effectively</li><li id="44cc" class="mz na gu nb b nc oa ne nf ng ob ni nj nk oc nm nn no od nq nr ns oe nu nv nw nx ny nz bk">Irrelevant Frames: Not every frame contributes to the action recognition process, and there may be many irrelevant frames which need to be ignored</li><li id="5ae9" class="mz na gu nb b nc oa ne nf ng ob ni nj nk oc nm nn no od nq nr ns oe nu nv nw nx ny nz bk">Expensive and Time Consuming Training: Video models are harder and more compute intensive than image models</li><li id="ae8b" class="mz na gu nb b nc oa ne nf ng ob ni nj nk oc nm nn no od nq nr ns oe nu nv nw nx ny nz bk">Generalization Challenges: Harder to generalize due to the amount of variations possible in the video space</li></ul><p id="d088" class="pw-post-body-paragraph mz na gu nb b nc nd ne nf ng nh ni nj nk nl nm nn no np nq nr ns nt nu nv nw gn bk">Videos are generally 32 or 64fps so it is common to lower the frame rate(subsample in the temporal dimension) prior to processing them.</p><p id="9029" class="pw-post-body-paragraph mz na gu nb b nc nd ne nf ng nh ni nj nk nl nm nn no np nq nr ns nt nu nv nw gn bk">In this blog, we’ll explore some of the early prominent approaches to action recognition and then cover some efficient methods that will help you get a strong overview of this field.</p><h1 id="86e7" class="of og gu bf oh oi oj ok ol om on oo op oq or os ot ou ov ow ox oy oz pa pb pc bk"><strong class="al">Single Stream Network</strong></h1><p id="2289" class="pw-post-body-paragraph mz na gu nb b nc pd ne nf ng pe ni nj nk pf nm nn no pg nq nr ns ph nu nv nw gn bk">Paper: Large-scale Video Classification with Convolutional Neural Networks: <a class="af pi" href="https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42455.pdf" rel="noopener ugc nofollow" target="_blank">https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42455.pdf</a></p><figure class="pk pl pm pn po mp mh mi paragraph-image"><div role="button" tabindex="0" class="mq mr fj ms bh mt"><div class="mh mi pj"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*rSXyK5HtenB3J3b9gvOG8A.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*rSXyK5HtenB3J3b9gvOG8A.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*rSXyK5HtenB3J3b9gvOG8A.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*rSXyK5HtenB3J3b9gvOG8A.png 786w, 
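For illustration, here is a minimal sketch (not from any of the papers discussed below) of uniform temporal subsampling with NumPy; the clip shape, the target frame count, and the `subsample_clip` helper are assumptions made purely for the example.

```python
import numpy as np

def subsample_clip(clip: np.ndarray, num_frames: int = 16) -> np.ndarray:
    """Uniformly subsample a video clip along the temporal axis.

    clip: array of shape (T, H, W, C), e.g. decoded RGB frames.
    Returns an array of shape (num_frames, H, W, C).
    """
    total = clip.shape[0]
    # Evenly spaced frame indices spanning the whole clip.
    idx = np.linspace(0, total - 1, num=num_frames).round().astype(int)
    return clip[idx]

# Example: a 64-frame clip reduced to 16 frames (a 4x lower effective frame rate).
dummy_clip = np.zeros((64, 224, 224, 3), dtype=np.uint8)
print(subsample_clip(dummy_clip).shape)  # (16, 224, 224, 3)
```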
In this blog, we'll explore some of the early prominent approaches to action recognition and then cover some efficient methods, giving you a strong overview of the field.

# Single Stream Network

Paper: Large-scale Video Classification with Convolutional Neural Networks: https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42455.pdf

[Figure: 4 different ways to process videos]

This was an early seminal paper showing different ways in which information from different frames can be merged. The paper makes use of 3D convolutions to merge multiple frames.

**Single-frame**: only the single middle frame is used, processed by a 2D convolutional network, to measure how accurate a model can be while disregarding temporal information. A naive baseline.

**Early Fusion**: here we pick the T middle frames and process them through a convolutional network whose first filter has size 11×11×3×T, where T is the temporal resolution of the filter. Only the middle T frames are processed, as shown in the diagram above.
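To make the single-frame and early-fusion inputs concrete, below is a hedged PyTorch sketch of just the first layers: the single-frame baseline is an ordinary 2D convolution on one RGB frame, while early fusion extends the first filter across T frames, implemented here as a 3D convolution whose temporal kernel equals T so that time collapses after the first layer. The value of T, the channel counts, strides, and input resolution are illustrative assumptions, not the paper's exact settings.

```python
import torch
import torch.nn as nn

T = 10  # temporal extent of the early-fusion filter (illustrative value)

# Single-frame baseline: a plain 2D convolution over one RGB frame.
single_frame_stem = nn.Conv2d(in_channels=3, out_channels=96,
                              kernel_size=11, stride=4)

# Early fusion: the first filter spans 11x11 pixels and T frames at once,
# so the temporal dimension collapses immediately after this layer.
early_fusion_stem = nn.Conv3d(in_channels=3, out_channels=96,
                              kernel_size=(T, 11, 11), stride=(1, 4, 4))

frame = torch.randn(1, 3, 224, 224)    # (N, C, H, W)
clip = torch.randn(1, 3, T, 224, 224)  # (N, C, T, H, W)

print(single_frame_stem(frame).shape)  # torch.Size([1, 96, 54, 54])
print(early_fusion_stem(clip).shape)   # torch.Size([1, 96, 1, 54, 54]); time is gone
```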
**Late Fusion**: here two separate single-frame networks with shared parameters are applied to two frames spaced 15 frames apart, and their features are merged for the final classification. Because information is fused only at the end, this is known as Late Fusion.

**Slow Fusion**: the Slow Fusion model fuses temporal information gradually throughout the network, so that higher layers get access to progressively more global information in both the spatial and temporal dimensions. This is implemented with temporal convolutions (3D convolutions) whose temporal receptive field grows as the network gets deeper.
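A rough sketch of the slow-fusion idea: stacked 3D convolutions each cover a small temporal extent, so deeper layers see a progressively wider temporal window before the final classifier. The `SlowFusionStem` class, its widths, kernel sizes, and the number of classes are placeholders rather than the paper's configuration.

```python
import torch
import torch.nn as nn

class SlowFusionStem(nn.Module):
    """Toy slow-fusion network: temporal information is merged gradually.

    Input: (N, C, T, H, W). Each Conv3d covers a small temporal extent,
    so the temporal receptive field grows with depth until only one
    temporal slice remains.
    """
    def __init__(self, num_classes: int = 101):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv3d(3, 64, kernel_size=(4, 7, 7), stride=(2, 2, 2)), nn.ReLU(),
            nn.Conv3d(64, 128, kernel_size=(4, 3, 3), stride=(2, 2, 2)), nn.ReLU(),
            nn.Conv3d(128, 256, kernel_size=(2, 3, 3), stride=(1, 2, 2)), nn.ReLU(),
        )
        self.head = nn.Sequential(
            nn.AdaptiveAvgPool3d(1), nn.Flatten(), nn.Linear(256, num_classes)
        )

    def forward(self, clip: torch.Tensor) -> torch.Tensor:
        return self.head(self.features(clip))

model = SlowFusionStem()
clip = torch.randn(2, 3, 16, 112, 112)  # batch of 16-frame RGB clips
print(model(clip).shape)                # torch.Size([2, 101])
```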
nu nv nw gn bk">Paper: Two-Stream Convolutional Networks for Action Recognition in Videos: <a class="af pi" href="https://arxiv.org/pdf/1406.2199.pdf" rel="noopener ugc nofollow" target="_blank">https://arxiv.org/pdf/1406.2199.pdf</a></p><p id="3090" class="pw-post-body-paragraph mz na gu nb b nc nd ne nf ng nh ni nj nk nl nm nn no np nq nr ns nt nu nv nw gn bk">One of the reasons networks such as the above, which are called Single Stream Networks, failed to live to their promise is because single frame image classification is a strong baseline. That is, it is usually possible to classify the whole video based on just the center frame run through a 2D CNN.</p><p id="30e9" class="pw-post-body-paragraph mz na gu nb b nc nd ne nf ng nh ni nj nk nl nm nn no np nq nr ns nt nu nv nw gn bk">Inspired by the two-streams hypothesis of the human visual system which states that human visual cortex contains two pathways: the ventral stream (which performs object recognition) and the dorsal stream (which recognises motion), this work attempts to aggregate spatial and temporal information via processing of a spatial and temporal components separately.</p><figure class="pk pl pm pn po mp mh mi paragraph-image"><div role="button" tabindex="0" class="mq mr fj ms bh mt"><div class="mh mi pr"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*O0HLUzyHL9wyKeieXSIJbA.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*O0HLUzyHL9wyKeieXSIJbA.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*O0HLUzyHL9wyKeieXSIJbA.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*O0HLUzyHL9wyKeieXSIJbA.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*O0HLUzyHL9wyKeieXSIJbA.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*O0HLUzyHL9wyKeieXSIJbA.png 1100w, https://miro.medium.com/v2/resize:fit:1400/format:webp/1*O0HLUzyHL9wyKeieXSIJbA.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*O0HLUzyHL9wyKeieXSIJbA.png 640w, https://miro.medium.com/v2/resize:fit:720/1*O0HLUzyHL9wyKeieXSIJbA.png 720w, https://miro.medium.com/v2/resize:fit:750/1*O0HLUzyHL9wyKeieXSIJbA.png 750w, https://miro.medium.com/v2/resize:fit:786/1*O0HLUzyHL9wyKeieXSIJbA.png 786w, https://miro.medium.com/v2/resize:fit:828/1*O0HLUzyHL9wyKeieXSIJbA.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*O0HLUzyHL9wyKeieXSIJbA.png 1100w, https://miro.medium.com/v2/resize:fit:1400/1*O0HLUzyHL9wyKeieXSIJbA.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px"/><img alt="" class="bh lo mu 
c" width="700" height="238" loading="lazy" role="presentation"/></picture></div></div><figcaption class="mv ff mw mh mi mx my bf b bg z du">Two Stream Network with Spatial and Temporal streams</figcaption></figure><p id="5c7c" class="pw-post-body-paragraph mz na gu nb b nc nd ne nf ng nh ni nj nk nl nm nn no np nq nr ns nt nu nv nw gn bk">Video can naturally be decomposed into spatial and temporal components. Here the spatial stream performs action classification from still video frames, whilst the temporal stream is trained to recognise action from motion in the form of dense optical flow. Optimal flow better isolates motion than RGB making it easy for the network to infer movements.</p><p id="6e18" class="pw-post-body-paragraph mz na gu nb b nc nd ne nf ng nh ni nj nk nl nm nn no np nq nr ns nt nu nv nw gn bk">Decoupling the spatial and temporal nets also allows us to exploit the availability of large amounts of annotated image data by pre-training the spatial net on the ImageNet challenge dataset.</p><figure class="pk pl pm pn po mp mh mi paragraph-image"><div class="mh mi ps"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*rsKIri8ucGxwiy0mvPSnTQ.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*rsKIri8ucGxwiy0mvPSnTQ.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*rsKIri8ucGxwiy0mvPSnTQ.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*rsKIri8ucGxwiy0mvPSnTQ.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*rsKIri8ucGxwiy0mvPSnTQ.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*rsKIri8ucGxwiy0mvPSnTQ.png 1100w, https://miro.medium.com/v2/resize:fit:962/format:webp/1*rsKIri8ucGxwiy0mvPSnTQ.png 962w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 481px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*rsKIri8ucGxwiy0mvPSnTQ.png 640w, https://miro.medium.com/v2/resize:fit:720/1*rsKIri8ucGxwiy0mvPSnTQ.png 720w, https://miro.medium.com/v2/resize:fit:750/1*rsKIri8ucGxwiy0mvPSnTQ.png 750w, https://miro.medium.com/v2/resize:fit:786/1*rsKIri8ucGxwiy0mvPSnTQ.png 786w, https://miro.medium.com/v2/resize:fit:828/1*rsKIri8ucGxwiy0mvPSnTQ.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*rsKIri8ucGxwiy0mvPSnTQ.png 1100w, https://miro.medium.com/v2/resize:fit:962/1*rsKIri8ucGxwiy0mvPSnTQ.png 962w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 481px"/><img alt="" class="bh lo mu c" width="481" height="165" loading="lazy" role="presentation"/></picture></div><figcaption class="mv ff mw mh mi mx my bf b bg z du">Optical flow</figcaption></figure><p id="80f3" 
class="pw-post-body-paragraph mz na gu nb b nc nd ne nf ng nh ni nj nk nl nm nn no np nq nr ns nt nu nv nw gn bk">The model aims to learn about structure from the Spatial Stream and movement from the Temporal Stream. These features are then fused using a linear layer at the end. This method significantly outperforms Slow Fusion mentioned above.</p><h1 id="ee3f" class="of og gu bf oh oi oj ok ol om on oo op oq or os ot ou ov ow ox oy oz pa pb pc bk">C3D: Learning Spatiotemporal Features with 3D Convolutional Networks</h1><p id="80cf" class="pw-post-body-paragraph mz na gu nb b nc pd ne nf ng pe ni nj nk pf nm nn no pg nq nr ns ph nu nv nw gn bk">Paper: Learning Spatiotemporal Features with 3D Convolutional Networks: <a class="af pi" href="https://arxiv.org/pdf/1412.0767.pdf" rel="noopener ugc nofollow" target="_blank">https://arxiv.org/pdf/1412.0767.pdf</a></p><p id="8069" class="pw-post-body-paragraph mz na gu nb b nc nd ne nf ng nh ni nj nk nl nm nn no np nq nr ns nt nu nv nw gn bk">This is another very important but simple paper which aims at replacing the deterministic optical flow method used previously by a 3D CNN. Optical flow methods are not perfect and 3D CNNs can pick more granular features provided more data and compute.</p><figure class="pk pl pm pn po mp mh mi paragraph-image"><div role="button" tabindex="0" class="mq mr fj ms bh mt"><div class="mh mi pt"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*bI_607tbITDjq6ZG80nXHA.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*bI_607tbITDjq6ZG80nXHA.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*bI_607tbITDjq6ZG80nXHA.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*bI_607tbITDjq6ZG80nXHA.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*bI_607tbITDjq6ZG80nXHA.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*bI_607tbITDjq6ZG80nXHA.png 1100w, https://miro.medium.com/v2/resize:fit:1400/format:webp/1*bI_607tbITDjq6ZG80nXHA.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*bI_607tbITDjq6ZG80nXHA.png 640w, https://miro.medium.com/v2/resize:fit:720/1*bI_607tbITDjq6ZG80nXHA.png 720w, https://miro.medium.com/v2/resize:fit:750/1*bI_607tbITDjq6ZG80nXHA.png 750w, https://miro.medium.com/v2/resize:fit:786/1*bI_607tbITDjq6ZG80nXHA.png 786w, https://miro.medium.com/v2/resize:fit:828/1*bI_607tbITDjq6ZG80nXHA.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*bI_607tbITDjq6ZG80nXHA.png 1100w, https://miro.medium.com/v2/resize:fit:1400/1*bI_607tbITDjq6ZG80nXHA.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, 
(-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px"/><img alt="" class="bh lo mu c" width="700" height="123" loading="lazy" role="presentation"/></picture></div></div><figcaption class="mv ff mw mh mi mx my bf b bg z du">C3D Architecture</figcaption></figure><p id="4354" class="pw-post-body-paragraph mz na gu nb b nc nd ne nf ng nh ni nj nk nl nm nn no np nq nr ns nt nu nv nw gn bk">The architecture is very short and simple. 3D convolutions are significantly more expensive than their 2D counterparts and they simply stack 3D convolutions in a single stream as 3D convolutions can pick up structure and motion concurrently. Quite a bit of data augmentation is done for robustness and generalisation.</p><p id="1f99" class="pw-post-body-paragraph mz na gu nb b nc nd ne nf ng nh ni nj nk nl nm nn no np nq nr ns nt nu nv nw gn bk">The authors perform experiments with a smaller 5 layer 3D CNN to determine the optimal value for the temporal receptive field size which they determine to be 3. Then they train a larger 8 layer 3D CNN for the results below.</p><figure class="pk pl pm pn po mp mh mi paragraph-image"><div class="mh mi pu"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/0*hQVBMEp8nW9AnIAL.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/0*hQVBMEp8nW9AnIAL.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/0*hQVBMEp8nW9AnIAL.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/0*hQVBMEp8nW9AnIAL.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/0*hQVBMEp8nW9AnIAL.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/0*hQVBMEp8nW9AnIAL.png 1100w, https://miro.medium.com/v2/resize:fit:1142/format:webp/0*hQVBMEp8nW9AnIAL.png 1142w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 571px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/0*hQVBMEp8nW9AnIAL.png 640w, https://miro.medium.com/v2/resize:fit:720/0*hQVBMEp8nW9AnIAL.png 720w, https://miro.medium.com/v2/resize:fit:750/0*hQVBMEp8nW9AnIAL.png 750w, https://miro.medium.com/v2/resize:fit:786/0*hQVBMEp8nW9AnIAL.png 786w, https://miro.medium.com/v2/resize:fit:828/0*hQVBMEp8nW9AnIAL.png 828w, https://miro.medium.com/v2/resize:fit:1100/0*hQVBMEp8nW9AnIAL.png 1100w, https://miro.medium.com/v2/resize:fit:1142/0*hQVBMEp8nW9AnIAL.png 1142w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 571px"/><img alt="" class="bh lo mu c" width="571" height="574" loading="lazy" role="presentation"/></picture></div><figcaption class="mv ff mw mh mi mx my bf b bg z du">Results</figcaption></figure><p id="5ffb" class="pw-post-body-paragraph mz na gu nb 
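As a rough illustration of this single-stream design, here is a heavily reduced C3D-style sketch in PyTorch; the channel widths, depth, and pooling schedule are assumptions for brevity and do not match the paper's exact 8-layer configuration.

```python
import torch
import torch.nn as nn

def conv3d_block(in_ch, out_ch):
    # 3x3x3 kernels: the temporal extent of 3 that the authors found to work best.
    return nn.Sequential(
        nn.Conv3d(in_ch, out_ch, kernel_size=3, padding=1),
        nn.ReLU(inplace=True),
        nn.MaxPool3d(kernel_size=2, stride=2),
    )

class TinyC3D(nn.Module):
    """A heavily reduced C3D-style network: a single stream of 3D convolutions
    that mixes space and time in every layer (channel widths are assumptions)."""
    def __init__(self, num_classes: int):
        super().__init__()
        self.features = nn.Sequential(
            conv3d_block(3, 32),
            conv3d_block(32, 64),
            conv3d_block(64, 128),
        )
        self.head = nn.Sequential(nn.AdaptiveAvgPool3d(1), nn.Flatten(),
                                  nn.Linear(128, num_classes))

    def forward(self, x):            # x: (B, 3, T, H, W), e.g. 16-frame clips
        return self.head(self.features(x))

clip = torch.randn(2, 3, 16, 112, 112)
print(TinyC3D(num_classes=101)(clip).shape)  # torch.Size([2, 101])
```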
In a post-hoc analysis, the authors find that their 3D convolutions learn temporal Gabor-like filters.

[Figure: Temporal Gabor Filters]

I3D: Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset

Paper: Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset: https://arxiv.org/abs/1705.07750

This paper introduced the famous Kinetics dataset for action recognition, summarises the research before it in the diagram below, and proposes a Two-Stream version of 3D CNNs (just when we thought two streams had been replaced by 3D CNNs), feeding optical flow to one pathway and using 3D CNNs to fuse the features of the two pathways.

[Figure: The different video architectures]

This work used pretrained 2D convolutional models and converted ("inflated") them into 3D models by replicating the learned filters along the temporal dimension. They also find that including optical flow as an additional input still helps the model: most optical flow methods are iterative and encode information that is difficult for a 3D CNN to fully learn, and they give the network explicit motion information from the very start, helping it learn early.
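Below is a minimal sketch of that inflation step for a single convolution, assuming PyTorch: the 2D kernel is repeated along time and rescaled by the temporal extent, so that on a video of identical repeated frames the inflated filter produces the same activations as the original 2D filter. The temporal extent of 3 used here is illustrative.

```python
import torch
import torch.nn as nn

def inflate_conv2d(conv2d: nn.Conv2d, time_kernel: int = 3) -> nn.Conv3d:
    """Inflate a pretrained 2D conv into a 3D conv by replicating its kernel
    along the temporal axis and rescaling by 1/time_kernel, so the inflated
    filter gives the same response on a video made of repeated frames."""
    conv3d = nn.Conv3d(
        conv2d.in_channels, conv2d.out_channels,
        kernel_size=(time_kernel, *conv2d.kernel_size),
        stride=(1, *conv2d.stride),
        padding=(time_kernel // 2, *conv2d.padding),
        bias=conv2d.bias is not None,
    )
    with torch.no_grad():
        w2d = conv2d.weight                       # (out, in, kH, kW)
        w3d = w2d.unsqueeze(2).repeat(1, 1, time_kernel, 1, 1) / time_kernel
        conv3d.weight.copy_(w3d)
        if conv2d.bias is not None:
            conv3d.bias.copy_(conv2d.bias)
    return conv3d

conv2d = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
conv3d = inflate_conv2d(conv2d, time_kernel=3)
print(conv3d.weight.shape)  # torch.Size([64, 3, 3, 7, 7])
```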
[Figure: Results]

As can be seen in their results, the ImageNet-pretrained Two-Stream I3D using optical flow obtains the best results (74.2).
Advances in Efficient Video Recognition

The video models above are quite heavy and require long training times. In this section we will cover some important papers focused on efficient processing of videos.

We will talk about three approaches that largely aim at reducing the heavy computational cost of video models.

SlowFast Networks for Video Recognition

Paper: SlowFast Networks for Video Recognition: https://arxiv.org/pdf/1812.03982.pdf

SlowFast Networks take inspiration from the P-cells and M-cells responsible for visual processing in the brain. The M-cells operate at high temporal frequency and are responsive to fast temporal changes, but are not sensitive to spatial detail or color. The P-cells provide fine spatial detail and color, but at lower temporal resolution, responding slowly to stimuli.

This paper tries to replicate that division with a two-stream network in which one stream mimics the M-cells and the other the P-cells: one operates at a high temporal frequency, the other at a much lower temporal frequency but with finer spatial detail.
[Figure: SlowFast Network]

The top or slow pathway subsamples frames at a low frame rate, so the images it sees are spread much further apart in time. It uses only a fraction of the total input (1/8th of the frames). This way it is focused on capturing contextual information and structure.

The lower or fast pathway processes all the frames but is made very lightweight, accounting for only ~20% of the total computation despite its high temporal rate. This pathway is focused on determining motion.

The slow pathway has a lower frame rate and more parameters, encouraging it to learn structure with its higher capacity. The fast pathway has fewer parameters, encouraging it to learn motion via simpler Gabor-like filters.
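Here is a minimal sketch of the two-pathway layout, assuming PyTorch: a heavy slow pathway on temporally subsampled frames and a thin fast pathway on all frames, fused by concatenating pooled features before the classifier. The channel widths and single-conv "backbones" are placeholders, and the paper's lateral connections from fast to slow are omitted.

```python
import torch
import torch.nn as nn

class TinySlowFast(nn.Module):
    """Sketch of the SlowFast idea: a heavy Slow pathway on temporally
    subsampled frames and a lightweight Fast pathway on all frames, fused by
    concatenating pooled features (lateral connections omitted)."""
    def __init__(self, num_classes: int, alpha: int = 8,
                 slow_ch: int = 64, fast_ch: int = 8):
        super().__init__()
        self.alpha = alpha
        # Slow pathway: high channel capacity, few frames -> semantics/structure.
        # Note the non-temporal (1, 7, 7) kernel.
        self.slow = nn.Sequential(
            nn.Conv3d(3, slow_ch, kernel_size=(1, 7, 7), padding=(0, 3, 3)),
            nn.ReLU(inplace=True), nn.AdaptiveAvgPool3d(1), nn.Flatten())
        # Fast pathway: ~1/8 of the channels, all frames -> motion.
        self.fast = nn.Sequential(
            nn.Conv3d(3, fast_ch, kernel_size=(5, 7, 7), padding=(2, 3, 3)),
            nn.ReLU(inplace=True), nn.AdaptiveAvgPool3d(1), nn.Flatten())
        self.fc = nn.Linear(slow_ch + fast_ch, num_classes)

    def forward(self, x):                 # x: (B, 3, T, H, W) full frame rate
        slow_in = x[:, :, ::self.alpha]   # keep 1 out of every alpha frames
        return self.fc(torch.cat([self.slow(slow_in), self.fast(x)], dim=1))

clips = torch.randn(2, 3, 32, 112, 112)
print(TinySlowFast(num_classes=400)(clips).shape)  # torch.Size([2, 400])
```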
[Figure: SlowFast Network]

You can see above that the slow pathway uses non-temporal convolutions until res4, so it is mostly focused on semantics, whereas the fast pathway uses temporal convolutions from res2 onwards but has far fewer channels, biasing it towards motion (Gabor-like) filters.

They also extend the model to other tasks such as action detection in video.
[Figure: SlowFast for Action Detection]

This model beats previous methods at a much lower FLOP count thanks to its carefully chosen pathway and filter sizes.

[Figure: Results]

X3D: Expanding Architectures for Efficient Video Recognition
nv nw gn bk">Paper: X3D: Expanding Architectures for Efficient Video Recognition: <a class="af pi" href="https://arxiv.org/abs/2004.04730" rel="noopener ugc nofollow" target="_blank">https://arxiv.org/abs/2004.04730</a></p><p id="dfef" class="pw-post-body-paragraph mz na gu nb b nc nd ne nf ng nh ni nj nk nl nm nn no np nq nr ns nt nu nv nw gn bk">Perhaps one of my most favorite papers all time, X3D aims to determine exactly how many parameters we need to do efficient video recognition. This work is done by a single author Christoph Feichtenhofer.</p><p id="96e8" class="pw-post-body-paragraph mz na gu nb b nc nd ne nf ng nh ni nj nk nl nm nn no np nq nr ns nt nu nv nw gn bk">Previously model sizes like layer count and filter count were based on heuristics and it was unclear how many parameters are required to reach certain levels of accuracy.</p><p id="cfd2" class="pw-post-body-paragraph mz na gu nb b nc nd ne nf ng nh ni nj nk nl nm nn no np nq nr ns nt nu nv nw gn bk">There are various dimensions that effect computation like: input spatial resolution, temporal resolution, number of layers, number of filters, bottleneck dimension etc. The factors can be seen as the Expansion operations on the right below:</p><figure class="pk pl pm pn po mp mh mi paragraph-image"><div role="button" tabindex="0" class="mq mr fj ms bh mt"><div class="mh mi qc"><picture><source srcSet="https://miro.medium.com/v2/resize:fit:640/format:webp/1*ZgEaxumewFVy5pt-JNkWLw.png 640w, https://miro.medium.com/v2/resize:fit:720/format:webp/1*ZgEaxumewFVy5pt-JNkWLw.png 720w, https://miro.medium.com/v2/resize:fit:750/format:webp/1*ZgEaxumewFVy5pt-JNkWLw.png 750w, https://miro.medium.com/v2/resize:fit:786/format:webp/1*ZgEaxumewFVy5pt-JNkWLw.png 786w, https://miro.medium.com/v2/resize:fit:828/format:webp/1*ZgEaxumewFVy5pt-JNkWLw.png 828w, https://miro.medium.com/v2/resize:fit:1100/format:webp/1*ZgEaxumewFVy5pt-JNkWLw.png 1100w, https://miro.medium.com/v2/resize:fit:1400/format:webp/1*ZgEaxumewFVy5pt-JNkWLw.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, (-webkit-min-device-pixel-ratio: 2) and (max-width: 700px) 100vw, 700px" type="image/webp"/><source data-testid="og" srcSet="https://miro.medium.com/v2/resize:fit:640/1*ZgEaxumewFVy5pt-JNkWLw.png 640w, https://miro.medium.com/v2/resize:fit:720/1*ZgEaxumewFVy5pt-JNkWLw.png 720w, https://miro.medium.com/v2/resize:fit:750/1*ZgEaxumewFVy5pt-JNkWLw.png 750w, https://miro.medium.com/v2/resize:fit:786/1*ZgEaxumewFVy5pt-JNkWLw.png 786w, https://miro.medium.com/v2/resize:fit:828/1*ZgEaxumewFVy5pt-JNkWLw.png 828w, https://miro.medium.com/v2/resize:fit:1100/1*ZgEaxumewFVy5pt-JNkWLw.png 1100w, https://miro.medium.com/v2/resize:fit:1400/1*ZgEaxumewFVy5pt-JNkWLw.png 1400w" sizes="(min-resolution: 4dppx) and (max-width: 700px) 50vw, (-webkit-min-device-pixel-ratio: 4) and (max-width: 700px) 50vw, (min-resolution: 3dppx) and (max-width: 700px) 67vw, (-webkit-min-device-pixel-ratio: 3) and (max-width: 700px) 65vw, (min-resolution: 2.5dppx) and (max-width: 700px) 80vw, (-webkit-min-device-pixel-ratio: 2.5) and (max-width: 700px) 80vw, (min-resolution: 2dppx) and (max-width: 700px) 100vw, 
[Figure: Computation Breakdown in a Video Model]

X3D takes an iterative approach to finding the best model (sketched in code below):

1) Train a tiny base model to convergence.

2) For each of the 6 expansion dimensions, increase it so that the computation roughly doubles, producing 6 candidate models.

3) From this set of 6 new models, pick the one with the highest accuracy, discard the rest, permanently expand the base model along that dimension, and repeat from 1).

In this way we know approximately which dimension of compute to increase, and we progressively grow the model with the best accuracy trade-off for each level of available compute.

This sounds like it would take a lot of training time, but because we start from a very small base model, the search completes after training only around 30 tiny models, which cumulatively require over 25 times fewer multiply-add operations for training than one of the previous large state-of-the-art networks.
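A schematic version of this loop, assuming Python: `train_and_eval` stands in for "train the candidate to convergence and return its validation accuracy", and the per-axis doubling rule is a simplification of the paper's exact expansion factors.

```python
import copy

# The six expansion axes described above (names are illustrative).
EXPAND_AXES = ["frames", "frame_rate", "resolution", "width", "bottleneck", "depth"]

def expand(config: dict, axis: str) -> dict:
    """Return a copy of `config` with `axis` increased so that the model's
    compute roughly doubles (the exact per-axis factors are paper details)."""
    new = copy.deepcopy(config)
    new[axis] = new[axis] * 2          # simplification: just double that axis
    return new

def x3d_search(base: dict, train_and_eval, steps: int = 10) -> dict:
    for _ in range(steps):
        # Build one candidate per axis, each at ~2x the current compute.
        candidates = {axis: expand(base, axis) for axis in EXPAND_AXES}
        scores = {axis: train_and_eval(cfg) for axis, cfg in candidates.items()}
        # Keep only the expansion that gave the best accuracy and repeat.
        best_axis = max(scores, key=scores.get)
        base = candidates[best_axis]
    return base

# Toy usage with a dummy scorer that always prefers larger resolution.
final = x3d_search({axis: 1 for axis in EXPAND_AXES},
                   train_and_eval=lambda cfg: cfg["resolution"], steps=3)
print(final)
```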
[Figure: Model Capacity vs Accuracy]

Above, each point is a new model obtained by doubling model capacity using the steps outlined earlier. Accuracy keeps increasing, but after roughly 10 GFLOPs the gains slow down, and we can clearly see how accuracy scales with compute.
[Figure: Comparison with other video models at different compute levels]

As the comparison above shows, X3D beats SlowFast in accuracy with less than half the TFLOPs! Refer to the paper for more details; this is a really interesting way to optimise the compute-accuracy trade-off.

A Multigrid Method for Efficiently Training Video Models

Paper: A Multigrid Method for Efficiently Training Video Models: https://arxiv.org/pdf/1912.00998.pdf

Results first!
[Figure: Speed!]

How can we train video models faster without tweaking the architecture? Is there a more efficient way to train our models?

[Figure: Image vs Video Models training time]
Video models are typically trained using a fixed mini-batch shape: a specific number of video clips, frames per clip, and spatial resolution. This fixed shape is chosen based on heuristics to balance accuracy and training speed, and the choice involves trade-offs: higher spatial resolutions can improve accuracy but slow down training, while lower resolutions speed up training but reduce accuracy.

Training at low spatial resolution can increase training speed drastically, since we can use larger batch sizes and higher learning rates, but the final accuracy is capped.
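As a rough, illustrative budget calculation (the exact scaling rules are in the paper): per-clip cost scales with frames × height × width, so shrinking the clip shape lets us grow the mini-batch by the same factor while keeping the cost per iteration roughly constant.

```python
# Per-clip cost scales as frames x height x width. Halving all three makes a
# clip ~8x cheaper, so the mini-batch can be ~8x larger at the same cost
# (with the learning rate scaled along with it, per the usual linear rule).
frames, height, width, batch = 16, 224, 224, 8
cost_ratio = (8 / frames) * (112 / height) * (112 / width)   # = 1/8
print(int(batch / cost_ratio))   # 64 clips per mini-batch at the smaller shape
```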
Can we get both the quick training of low resolution and the added accuracy of training at higher resolution? For this, the authors propose simple variable mini-batch shapes with different spatio-temporal resolutions. These shapes are determined by resampling the training data on multiple grids, so that the model being trained benefits from seeing different spatial resolutions.

[Figure: Sampling a grid differently]
[Figure: mini-batch shape schedules, with batch size on the y-axis]

The long cycle follows a coarse-to-fine strategy: the model sees progressively higher input resolutions, so it trains very quickly at the start and is then refined at higher resolution to reach maximum accuracy. The short cycle instead mixes several shapes from iteration to iteration, while the long+short schedule combines both.
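Here is a minimal sketch of what such schedules could look like; the specific shapes, epoch fractions, and the "shrink the spatial size on two of every three iterations" rule are illustrative stand-ins rather than the paper's exact long and short cycles.

```python
# Illustrative long cycle: coarse grids early in training, the full grid at the end.
# Entries are (fraction of total epochs, (T, H, W)); the values are made up.
LONG_CYCLE = [
    (0.4, (8, 112, 112)),
    (0.3, (8, 160, 160)),
    (0.2, (16, 160, 160)),
    (0.1, (16, 224, 224)),
]

def long_cycle_shape(epoch, total_epochs):
    """Pick this epoch's (T, H, W) grid from the coarse-to-fine schedule."""
    progress = epoch / total_epochs
    cumulative = 0.0
    for fraction, shape in LONG_CYCLE:
        cumulative += fraction
        if progress < cumulative:
            return shape
    return LONG_CYCLE[-1][1]

def long_plus_short_shape(epoch, total_epochs, iteration):
    """Layer a short cycle on top: use a smaller spatial size on two
    out of every three iterations within the current long-cycle grid."""
    t, h, w = long_cycle_shape(epoch, total_epochs)
    factor = (0.5, 0.7, 1.0)[iteration % 3]
    return (t, int(h * factor), int(w * factor))

for epoch in (0, 45, 75, 95):
    print(epoch, long_cycle_shape(epoch, 100))
# 0 -> (8, 112, 112), 45 -> (8, 160, 160), 75 -> (16, 160, 160), 95 -> (16, 224, 224)
```

Whichever shape a given iteration uses, its batch size would be rescaled with the rule sketched earlier so that each mini-batch stays roughly the same cost.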
As can be seen below, training with this method reaches close to the maximum accuracy roughly three times faster than regular training.

[Figure: Speed!]

Conclusion:

In this blog we covered early prominent approaches such as single-stream and two-stream networks, C3D, and I3D. We then explored strides in efficient video recognition, with a spotlight on pioneering methods like SlowFast Networks, X3D’s parameter optimization strategy, and the Multigrid Training Method for faster convergence.
The last two works are particularly impactful in their approach and provide methods that can be used in other domains of learning as well.

Hope you found this useful!
.587-.415l-1.117-2.363c-.26-.56-.194-.98.204-1.289a.7.7 0 0 1 .546-.132c.283.046.545.232.727.501l2.193 3.86c1.302 2.38.883 4.59-1.277 6.75-1.156 1.156-2.602 1.627-4.19 1.367-1.418-.236-2.866-1.033-4.079-2.246M10.75 5.971l2.12 2.12c-.41.502-.465 1.17-.128 1.89l.22.465-3.523-3.523a.8.8 0 0 1-.097-.368c0-.22.086-.428.241-.584a.847.847 0 0 1 1.167 0m7.355 1.705c-.31-.461-.746-.758-1.23-.837a1.44 1.44 0 0 0-1.11.275c-.312.24-.505.543-.59.881a1.74 1.74 0 0 0-.906-.465 1.47 1.47 0 0 0-.82.106l-2.182-2.182a1.56 1.56 0 0 0-2.2 0 1.54 1.54 0 0 0-.396.701 1.56 1.56 0 0 0-2.21-.01 1.55 1.55 0 0 0-.416.753c-.624-.624-1.649-.624-2.237-.037a1.557 1.557 0 0 0 0 2.2c-.239.1-.501.238-.715.453a1.56 1.56 0 0 0 0 2.2l.516.515a1.556 1.556 0 0 0-.753 2.615L7.01 19c1.32 1.319 2.909 2.189 4.475 2.449q.482.08.971.08c.85 0 1.653-.198 2.393-.579.231.033.46.054.686.054 1.266 0 2.457-.52 3.505-1.567 2.763-2.763 2.552-5.734 1.439-7.586z" clip-rule="evenodd"></path></svg></div></div></div></a></span></div><div class="pw-multi-vote-count l ks kt ku kv kw kx ky"><p class="bf b dv z du"><span class="kz">--</span></p></div></div></span><span class="l h g f rg rh"><div class="ab q ke kf"><div class="pw-multi-vote-icon fj kg kh ki kj"><span><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="footerClapButton" href="https://medium.com/m/signin?actionUrl=https%3A%2F%2Fmedium.com%2F_%2Fvote%2Fp%2F12d803da3854&operation=register&redirect=https%3A%2F%2Frahuld3eora.medium.com%2Fdeep-learning-for-action-recognition-from-basics-to-efficiency-advancements-12d803da3854&user=Rahul+Deora&userId=c0b8e0e6e9b0&source=---footer_actions--12d803da3854---------------------clap_footer-----------" rel="noopener follow"><div><div class="bm" aria-hidden="false"><div class="kk ao kl km kn ko am kp kq kr kj"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" aria-label="clap"><path fill-rule="evenodd" d="M11.37.828 12 3.282l.63-2.454zM13.916 3.953l1.523-2.112-1.184-.39zM8.589 1.84l1.522 2.112-.337-2.501zM18.523 18.92c-.86.86-1.75 1.246-2.62 1.33a6 6 0 0 0 .407-.372c2.388-2.389 2.86-4.951 1.399-7.623l-.912-1.603-.79-1.672c-.26-.56-.194-.98.203-1.288a.7.7 0 0 1 .546-.132c.283.046.546.231.728.5l2.363 4.157c.976 1.624 1.141 4.237-1.324 6.702m-10.999-.438L3.37 14.328a.828.828 0 0 1 .585-1.408.83.83 0 0 1 .585.242l2.158 2.157a.365.365 0 0 0 .516-.516l-2.157-2.158-1.449-1.449a.826.826 0 0 1 1.167-1.17l3.438 3.44a.363.363 0 0 0 .516 0 .364.364 0 0 0 0-.516L5.293 9.513l-.97-.97a.826.826 0 0 1 0-1.166.84.84 0 0 1 1.167 0l.97.968 3.437 3.436a.36.36 0 0 0 .517 0 .366.366 0 0 0 0-.516L6.977 7.83a.82.82 0 0 1-.241-.584.82.82 0 0 1 .824-.826c.219 0 .43.087.584.242l5.787 5.787a.366.366 0 0 0 .587-.415l-1.117-2.363c-.26-.56-.194-.98.204-1.289a.7.7 0 0 1 .546-.132c.283.046.545.232.727.501l2.193 3.86c1.302 2.38.883 4.59-1.277 6.75-1.156 1.156-2.602 1.627-4.19 1.367-1.418-.236-2.866-1.033-4.079-2.246M10.75 5.971l2.12 2.12c-.41.502-.465 1.17-.128 1.89l.22.465-3.523-3.523a.8.8 0 0 1-.097-.368c0-.22.086-.428.241-.584a.847.847 0 0 1 1.167 0m7.355 1.705c-.31-.461-.746-.758-1.23-.837a1.44 1.44 0 0 0-1.11.275c-.312.24-.505.543-.59.881a1.74 1.74 0 0 0-.906-.465 1.47 1.47 0 0 0-.82.106l-2.182-2.182a1.56 1.56 0 0 0-2.2 0 1.54 1.54 0 0 0-.396.701 1.56 1.56 0 0 0-2.21-.01 1.55 1.55 0 0 0-.416.753c-.624-.624-1.649-.624-2.237-.037a1.557 1.557 0 0 0 0 2.2c-.239.1-.501.238-.715.453a1.56 1.56 0 0 0 0 2.2l.516.515a1.556 1.556 0 0 0-.753 2.615L7.01 19c1.32 1.319 2.909 2.189 4.475 2.449q.482.08.971.08c.85 0 1.653-.198 
2.393-.579.231.033.46.054.686.054 1.266 0 2.457-.52 3.505-1.567 2.763-2.763 2.552-5.734 1.439-7.586z" clip-rule="evenodd"></path></svg></div></div></div></a></span></div><div class="pw-multi-vote-count l ks kt ku kv kw kx ky"><p class="bf b dv z du"><span class="kz">--</span></p></div></div></span></div><div class="bq ab"><div><div class="bm" aria-hidden="false"><button class="ao kk lc ld ab q fk le lf" aria-label="responses"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" class="lb"><path d="M18.006 16.803c1.533-1.456 2.234-3.325 2.234-5.321C20.24 7.357 16.709 4 12.191 4S4 7.357 4 11.482c0 4.126 3.674 7.482 8.191 7.482.817 0 1.622-.111 2.393-.327.231.2.48.391.744.559 1.06.693 2.203 1.044 3.399 1.044.224-.008.4-.112.486-.287a.49.49 0 0 0-.042-.518c-.495-.67-.845-1.364-1.04-2.057a4 4 0 0 1-.125-.598zm-3.122 1.055-.067-.223-.315.096a8 8 0 0 1-2.311.338c-4.023 0-7.292-2.955-7.292-6.587 0-3.633 3.269-6.588 7.292-6.588 4.014 0 7.112 2.958 7.112 6.593 0 1.794-.608 3.469-2.027 4.72l-.195.168v.255c0 .056 0 .151.016.295.025.231.081.478.154.733.154.558.398 1.117.722 1.659a5.3 5.3 0 0 1-2.165-.845c-.276-.176-.714-.383-.941-.59z"></path></svg><p class="bf b bg z du"><span class="pw-responses-count la lb">1</span></p></button></div></div></div></div><div class="ab q"><div class="ri l is"><div><div class="bm" aria-hidden="false"><span><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="footerBookmarkButton" href="https://medium.com/m/signin?actionUrl=https%3A%2F%2Fmedium.com%2F_%2Fbookmark%2Fp%2F12d803da3854&operation=register&redirect=https%3A%2F%2Frahuld3eora.medium.com%2Fdeep-learning-for-action-recognition-from-basics-to-efficiency-advancements-12d803da3854&source=---footer_actions--12d803da3854---------------------bookmark_footer-----------" rel="noopener follow"><svg xmlns="http://www.w3.org/2000/svg" width="25" height="25" fill="none" viewBox="0 0 25 25" class="du lh" aria-label="Add to list bookmark button"><path fill="currentColor" d="M18 2.5a.5.5 0 0 1 1 0V5h2.5a.5.5 0 0 1 0 1H19v2.5a.5.5 0 1 1-1 0V6h-2.5a.5.5 0 0 1 0-1H18zM7 7a1 1 0 0 1 1-1h3.5a.5.5 0 0 0 0-1H8a2 2 0 0 0-2 2v14a.5.5 0 0 0 .805.396L12.5 17l5.695 4.396A.5.5 0 0 0 19 21v-8.5a.5.5 0 0 0-1 0v7.485l-5.195-4.012a.5.5 0 0 0-.61 0L7 19.985z"></path></svg></a></span></div></div></div><div class="ri l is"><div class="bm" aria-hidden="false" aria-describedby="postFooterSocialMenu" aria-labelledby="postFooterSocialMenu"><div><div class="bm" aria-hidden="false"><button aria-controls="postFooterSocialMenu" aria-expanded="false" aria-label="Share Post" data-testid="footerSocialShareButton" class="af fk ah ai aj ak al lp an ao ap ex lq lr lf ls"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" fill="none" viewBox="0 0 24 24"><path fill="currentColor" fill-rule="evenodd" d="M15.218 4.931a.4.4 0 0 1-.118.132l.012.006a.45.45 0 0 1-.292.074.5.5 0 0 1-.3-.13l-2.02-2.02v7.07c0 .28-.23.5-.5.5s-.5-.22-.5-.5v-7.04l-2 2a.45.45 0 0 1-.57.04h-.02a.4.4 0 0 1-.16-.3.4.4 0 0 1 .1-.32l2.8-2.8a.5.5 0 0 1 .7 0l2.8 2.79a.42.42 0 0 1 .068.498m-.106.138.008.004v-.01zM16 7.063h1.5a2 2 0 0 1 2 2v10a2 2 0 0 1-2 2h-11c-1.1 0-2-.9-2-2v-10a2 2 0 0 1 2-2H8a.5.5 0 0 1 .35.15.5.5 0 0 1 .15.35.5.5 0 0 1-.15.35.5.5 0 0 1-.35.15H6.4c-.5 0-.9.4-.9.9v10.2a.9.9 0 0 0 .9.9h11.2c.5 0 .9-.4.9-.9v-10.2c0-.5-.4-.9-.9-.9H16a.5.5 0 0 1 0-1" clip-rule="evenodd"></path></svg></button></div></div></div></div></div></div></div></div></div></footer><div class="rj rk rl rm rn l"><div class="ab cb"><div class="ci bh fz ga gb 
gc"><div class="ro bh r rp"></div><div class="ab rq rr rs iu it"><div class="rt ru rv rw rx ry rz sa sb sc ab cp"><div class="h k"><a tabindex="0" rel="noopener follow" href="/?source=post_page---post_author_info--12d803da3854--------------------------------"><div class="l fj"><img alt="Rahul Deora" class="l fd by ic ib cx" src="https://miro.medium.com/v2/resize:fill:96:96/1*yiQbwaG6UYBBeLZeDgIVLg.jpeg" width="48" height="48" loading="lazy"/><div class="fr by l ic ib fs n ay sd"></div></div></a></div><div class="j i d"><a tabindex="0" rel="noopener follow" href="/?source=post_page---post_author_info--12d803da3854--------------------------------"><div class="l fj"><img alt="Rahul Deora" class="l fd by se sf cx" src="https://miro.medium.com/v2/resize:fill:128:128/1*yiQbwaG6UYBBeLZeDgIVLg.jpeg" width="64" height="64" loading="lazy"/><div class="fr by l se sf fs n ay sd"></div></div></a></div><div class="j i d sg is"><div class="ab"><span><button class="bf b bg z sh qp si sj sk sl sm ev ew sn so sp fa fb fc fd bm fe ff">Follow</button></span></div></div></div><div class="ab co sq"><div class="sr ss st su sv l"><a class="af ag ah aj ak al am an ao ap aq ar as at ab q" rel="noopener follow" href="/?source=post_page---post_author_info--12d803da3854--------------------------------"><h2 class="pw-author-name bf sx sy sz ta tb tc td nk te tf no tg th ns ti tj bk"><span class="gn sw">Written by <!-- -->Rahul Deora</span></h2></a><div class="qn ab ia"><div class="l is"><span class="pw-follower-count bf b bg z du"><a class="af ag ah ai aj ak al am an ao ap aq ar il" rel="noopener follow" href="/followers?source=post_page---post_author_info--12d803da3854--------------------------------">75 Followers</a></span></div><div class="bf b bg z du ab tk"><span class="im l" aria-hidden="true"><span class="bf b bg z du">·</span></span><a class="af ag ah ai aj ak al am an ao ap aq ar il" rel="noopener follow" href="/following?source=post_page---post_author_info--12d803da3854--------------------------------">32 Following</a></div></div><div class="tl l"><p class="bf b bg z bk">Computer Vision Research Engineer. 
Personal Blog site: <a class="af ag ah ai aj ak al am an ao ap aq ar pi go" href="https://bluesky314.github.io/" rel="noopener ugc nofollow">https://bluesky314.github.io/</a></p></div></div></div><div class="h k"><div class="ab"><span><button class="bf b bg z sh qp si sj sk sl sm ev ew sn so sp fa fb fc fd bm fe ff">Follow</button></span></div></div></div></div></div></div><div class="tm tn to tp tq l"><div class="ro bh r tm tn tr ts tt"></div><div class="ab cb"><div class="ci bh fz ga gb gc"><div class="ab q cp"><h2 class="bf sx oi ok ol om oo op oq os ot ou ow ox oy pa pb bk">Responses (<!-- -->1<!-- -->)</h2><div class="ab tu"><div><div class="bm" aria-hidden="false"><a class="tv tw" href="https://policy.medium.com/medium-rules-30e5502c4eb4?source=post_page---post_responses--12d803da3854--------------------------------" rel="noopener follow" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" width="25" height="25" viewBox="0 0 25 25"><path fill-rule="evenodd" d="M11.987 5.036a.754.754 0 0 1 .914-.01c.972.721 1.767 1.218 2.6 1.543.828.322 1.719.485 2.887.505a.755.755 0 0 1 .741.757c-.018 3.623-.43 6.256-1.449 8.21-1.034 1.984-2.662 3.209-4.966 4.083a.75.75 0 0 1-.537-.003c-2.243-.874-3.858-2.095-4.897-4.074-1.024-1.951-1.457-4.583-1.476-8.216a.755.755 0 0 1 .741-.757c1.195-.02 2.1-.182 2.923-.503.827-.322 1.6-.815 2.519-1.535m.468.903c-.897.69-1.717 1.21-2.623 1.564-.898.35-1.856.527-3.026.565.037 3.45.469 5.817 1.36 7.515.884 1.684 2.25 2.762 4.284 3.571 2.092-.81 3.465-1.89 4.344-3.575.886-1.698 1.299-4.065 1.334-7.512-1.149-.039-2.091-.217-2.99-.567-.906-.353-1.745-.873-2.683-1.561m-.009 9.155a2.672 2.672 0 1 0 0-5.344 2.672 2.672 0 0 0 0 5.344m0 1a3.672 3.672 0 1 0 0-7.344 3.672 3.672 0 0 0 0 7.344m-1.813-3.777.525-.526.916.917 1.623-1.625.526.526-2.149 2.152z" clip-rule="evenodd"></path></svg></a></div></div></div></div><div class="tx l"><button class="bf b bg z bk qp ty tz ua lh le sm ev ew ex ub uc ud fa ue uf ug uh ui fb fc fd bm fe ff">See all responses</button></div></div></div></div><div class="uj uk ul um un l bx"><div class="h k j"><div class="ro bh uo up"></div><div class="ab cb"><div class="ci bh fz ga gb gc"><div class="uq ab ke iv"><div class="ur us l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://help.medium.com/hc/en-us?source=post_page-----12d803da3854--------------------------------" rel="noopener follow"><p class="bf b dv z du">Help</p></a></div><div class="ur us l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://medium.statuspage.io/?source=post_page-----12d803da3854--------------------------------" rel="noopener follow"><p class="bf b dv z du">Status</p></a></div><div class="ur us l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://medium.com/about?autoplay=1&source=post_page-----12d803da3854--------------------------------" rel="noopener follow"><p class="bf b dv z du">About</p></a></div><div class="ur us l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://medium.com/jobs-at-medium/work-at-medium-959d1a85284e?source=post_page-----12d803da3854--------------------------------" rel="noopener follow"><p class="bf b dv z du">Careers</p></a></div><div class="ur us l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="pressinquiries@medium.com?source=post_page-----12d803da3854--------------------------------" rel="noopener follow"><p class="bf b dv z du">Press</p></a></div><div class="ur us l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" 
href="https://blog.medium.com/?source=post_page-----12d803da3854--------------------------------" rel="noopener follow"><p class="bf b dv z du">Blog</p></a></div><div class="ur us l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://policy.medium.com/medium-privacy-policy-f03bf92035c9?source=post_page-----12d803da3854--------------------------------" rel="noopener follow"><p class="bf b dv z du">Privacy</p></a></div><div class="ur us l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://policy.medium.com/medium-terms-of-service-9db0094a1e0f?source=post_page-----12d803da3854--------------------------------" rel="noopener follow"><p class="bf b dv z du">Terms</p></a></div><div class="ur us l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://speechify.com/medium?source=post_page-----12d803da3854--------------------------------" rel="noopener follow"><p class="bf b dv z du">Text to speech</p></a></div><div class="ur l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://medium.com/business?source=post_page-----12d803da3854--------------------------------" rel="noopener follow"><p class="bf b dv z du">Teams</p></a></div></div></div></div></div></div></div></div></div></div><script>window.__BUILD_ID__="main-20241202-120250-b4d476f058"</script><script>window.__GRAPHQL_URI__ = "https://rahuld3eora.medium.com/_/graphql"</script><script>window.__PRELOADED_STATE__ = {"algolia":{"queries":{}},"cache":{"experimentGroupSet":true,"reason":"","group":"enabled","tags":["group-edgeCachePosts","post-12d803da3854","user-c0b8e0e6e9b0"],"serverVariantState":"44136fa355b3678a1146ad16f7e8649e94fb4fc21fe77e8310c060f61caaff8a","middlewareEnabled":true,"cacheStatus":"DYNAMIC","shouldUseCache":true,"vary":[],"lohpSummerUpsellEnabled":false,"publicationHierarchyEnabledWeb":false,"postBottomResponsesEnabled":false},"client":{"hydrated":false,"isUs":false,"isNativeMedium":false,"isSafariMobile":false,"isSafari":false,"isFirefox":true,"routingEntity":{"type":"USER","id":"c0b8e0e6e9b0","explicit":true},"viewerIsBot":false},"debug":{"requestId":"3fe417b6-1384-453e-b0a5-13ed26ca2127","hybridDevServices":[],"originalSpanCarrier":{"traceparent":"00-1bd2f002a46f7f1d2923b839417ca60d-b1f884bc7c723ac1-01"}},"multiVote":{"clapsPerPost":{}},"navigation":{"branch":{"show":null,"hasRendered":null,"blockedByCTA":false},"hideGoogleOneTap":false,"hasRenderedAlternateUserBanner":null,"currentLocation":"https:\u002F\u002Frahuld3eora.medium.com\u002Fdeep-learning-for-action-recognition-from-basics-to-efficiency-advancements-12d803da3854","host":"rahuld3eora.medium.com","hostname":"rahuld3eora.medium.com","referrer":"","hasSetReferrer":false,"susiModal":{"step":null,"operation":"register"},"postRead":false,"partnerProgram":{"selectedCountryCode":null},"queryString":"","currentHash":""},"config":{"nodeEnv":"production","version":"main-20241202-120250-b4d476f058","target":"production","productName":"Medium","publicUrl":"https:\u002F\u002Fcdn-client.medium.com\u002Flite","authDomain":"medium.com","authGoogleClientId":"216296035834-k1k6qe060s2tp2a2jam4ljdcms00sttg.apps.googleusercontent.com","favicon":"production","glyphUrl":"https:\u002F\u002Fglyph.medium.com","branchKey":"key_live_ofxXr2qTrrU9NqURK8ZwEhknBxiI6KBm","algolia":{"appId":"MQ57UUUQZ2","apiKeySearch":"394474ced050e3911ae2249ecc774921","indexPrefix":"medium_","host":"-dsn.algolia.net"},"recaptchaKey":"6Lfc37IUAAAAAKGGtC6rLS13R1Hrw_BqADfS1LRk","recaptcha3Key":"6Lf8R9wUAAAAABMI_85Wb8melS7Zj6ziuf99Yot5","recapt
chaEnterpriseKeyId":"6Le-uGgpAAAAAPprRaokM8AKthQ9KNGdoxaGUvVp","datadog":{"applicationId":"6702d87d-a7e0-42fe-bbcb-95b469547ea0","clientToken":"pub853ea8d17ad6821d9f8f11861d23dfed","rumToken":"pubf9cc52896502b9413b68ba36fc0c7162","context":{"deployment":{"target":"production","tag":"main-20241202-120250-b4d476f058","commit":"b4d476f05843111baca5a9be048d4d77758064af"}},"datacenter":"us"},"googleAnalyticsCode":"G-7JY7T788PK","googlePay":{"apiVersion":"2","apiVersionMinor":"0","merchantId":"BCR2DN6TV7EMTGBM","merchantName":"Medium","instanceMerchantId":"13685562959212738550"},"applePay":{"version":3},"signInWallCustomDomainCollectionIds":["3a8144eabfe3","336d898217ee","61061eb0c96b","138adf9c44c","819cc2aaeee0"],"mediumMastodonDomainName":"me.dm","mediumOwnedAndOperatedCollectionIds":["8a9336e5bb4","b7e45b22fec3","193b68bd4fba","8d6b8a439e32","54c98c43354d","3f6ecf56618","d944778ce714","92d2092dc598","ae2a65f35510","1285ba81cada","544c7006046e","fc8964313712","40187e704f1c","88d9857e584e","7b6769f2748b","bcc38c8f6edf","cef6983b292","cb8577c9149e","444d13b52878","713d7dbc99b0","ef8e90590e66","191186aaafa0","55760f21cdc5","9dc80918cc93","bdc4052bbdba","8ccfed20cbb2"],"tierOneDomains":["medium.com","thebolditalic.com","arcdigital.media","towardsdatascience.com","uxdesign.cc","codeburst.io","psiloveyou.xyz","writingcooperative.com","entrepreneurshandbook.co","prototypr.io","betterhumans.coach.me","theascent.pub"],"topicsToFollow":["d61cf867d93f","8a146bc21b28","1eca0103fff3","4d562ee63426","aef1078a3ef5","e15e46793f8d","6158eb913466","55f1c20aba7a","3d18b94f6858","4861fee224fd","63c6f1f93ee","1d98b3a9a871","decb52b64abf","ae5d4995e225","830cded25262"],"topicToTagMappings":{"accessibility":"accessibility","addiction":"addiction","android-development":"android-development","art":"art","artificial-intelligence":"artificial-intelligence","astrology":"astrology","basic-income":"basic-income","beauty":"beauty","biotech":"biotech","blockchain":"blockchain","books":"books","business":"business","cannabis":"cannabis","cities":"cities","climate-change":"climate-change","comics":"comics","coronavirus":"coronavirus","creativity":"creativity","cryptocurrency":"cryptocurrency","culture":"culture","cybersecurity":"cybersecurity","data-science":"data-science","design":"design","digital-life":"digital-life","disability":"disability","economy":"economy","education":"education","equality":"equality","family":"family","feminism":"feminism","fiction":"fiction","film":"film","fitness":"fitness","food":"food","freelancing":"freelancing","future":"future","gadgets":"gadgets","gaming":"gaming","gun-control":"gun-control","health":"health","history":"history","humor":"humor","immigration":"immigration","ios-development":"ios-development","javascript":"javascript","justice":"justice","language":"language","leadership":"leadership","lgbtqia":"lgbtqia","lifestyle":"lifestyle","machine-learning":"machine-learning","makers":"makers","marketing":"marketing","math":"math","media":"media","mental-health":"mental-health","mindfulness":"mindfulness","money":"money","music":"music","neuroscience":"neuroscience","nonfiction":"nonfiction","outdoors":"outdoors","parenting":"parenting","pets":"pets","philosophy":"philosophy","photography":"photography","podcasts":"podcast","poetry":"poetry","politics":"politics","privacy":"privacy","product-management":"product-management","productivity":"productivity","programming":"programming","psychedelics":"psychedelics","psychology":"psychology","race":"race","relationships":"relationships","religi
on":"religion","remote-work":"remote-work","san-francisco":"san-francisco","science":"science","self":"self","self-driving-cars":"self-driving-cars","sexuality":"sexuality","social-media":"social-media","society":"society","software-engineering":"software-engineering","space":"space","spirituality":"spirituality","sports":"sports","startups":"startup","style":"style","technology":"technology","transportation":"transportation","travel":"travel","true-crime":"true-crime","tv":"tv","ux":"ux","venture-capital":"venture-capital","visual-design":"visual-design","work":"work","world":"world","writing":"writing"},"defaultImages":{"avatar":{"imageId":"1*dmbNkD5D-u45r44go_cf0g.png","height":150,"width":150},"orgLogo":{"imageId":"7*V1_7XP4snlmqrc_0Njontw.png","height":110,"width":500},"postLogo":{"imageId":"bd978bb536350a710e8efb012513429cabdc4c28700604261aeda246d0f980b7","height":810,"width":1440},"postPreviewImage":{"imageId":"1*hn4v1tCaJy7cWMyb0bpNpQ.png","height":386,"width":579}},"collectionStructuredData":{"8d6b8a439e32":{"name":"Elemental","data":{"@type":"NewsMediaOrganization","ethicsPolicy":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Farticles\u002F360043290473","logo":{"@type":"ImageObject","url":"https:\u002F\u002Fcdn-images-1.medium.com\u002Fmax\u002F980\u002F1*9ygdqoKprhwuTVKUM0DLPA@2x.png","width":980,"height":159}}},"3f6ecf56618":{"name":"Forge","data":{"@type":"NewsMediaOrganization","ethicsPolicy":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Farticles\u002F360043290473","logo":{"@type":"ImageObject","url":"https:\u002F\u002Fcdn-images-1.medium.com\u002Fmax\u002F596\u002F1*uULpIlImcO5TDuBZ6lm7Lg@2x.png","width":596,"height":183}}},"ae2a65f35510":{"name":"GEN","data":{"@type":"NewsMediaOrganization","ethicsPolicy":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Farticles\u002F360043290473","logo":{"@type":"ImageObject","url":"https:\u002F\u002Fmiro.medium.com\u002Fmax\u002F264\u002F1*RdVZMdvfV3YiZTw6mX7yWA.png","width":264,"height":140}}},"88d9857e584e":{"name":"LEVEL","data":{"@type":"NewsMediaOrganization","ethicsPolicy":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Farticles\u002F360043290473","logo":{"@type":"ImageObject","url":"https:\u002F\u002Fmiro.medium.com\u002Fmax\u002F540\u002F1*JqYMhNX6KNNb2UlqGqO2WQ.png","width":540,"height":108}}},"7b6769f2748b":{"name":"Marker","data":{"@type":"NewsMediaOrganization","ethicsPolicy":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Farticles\u002F360043290473","logo":{"@type":"ImageObject","url":"https:\u002F\u002Fcdn-images-1.medium.com\u002Fmax\u002F383\u002F1*haCUs0wF6TgOOvfoY-jEoQ@2x.png","width":383,"height":92}}},"444d13b52878":{"name":"OneZero","data":{"@type":"NewsMediaOrganization","ethicsPolicy":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Farticles\u002F360043290473","logo":{"@type":"ImageObject","url":"https:\u002F\u002Fmiro.medium.com\u002Fmax\u002F540\u002F1*cw32fIqCbRWzwJaoQw6BUg.png","width":540,"height":123}}},"8ccfed20cbb2":{"name":"Zora","data":{"@type":"NewsMediaOrganization","ethicsPolicy":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Farticles\u002F360043290473","logo":{"@type":"ImageObject","url":"https:\u002F\u002Fmiro.medium.com\u002Fmax\u002F540\u002F1*tZUQqRcCCZDXjjiZ4bDvgQ.png","width":540,"height":106}}}},"embeddedPostIds":{"coronavirus":"cd3010f9d81f"},"sharedCdcMessaging":{"COVID_APPLICABLE_TAG_SLUGS":[],"COVID_APPLICABLE_TOPIC_NAMES":[],"COVID_APPLICABLE_TOPIC_NAMES_FOR_TOPIC_PAGE":[],"COVID_MESSAGES":{"tierA":{"text":"For 
more information on the novel coronavirus and Covid-19, visit cdc.gov.","markups":[{"start":66,"end":73,"href":"https:\u002F\u002Fwww.cdc.gov\u002Fcoronavirus\u002F2019-nCoV"}]},"tierB":{"text":"Anyone can publish on Medium per our Policies, but we don’t fact-check every story. For more info about the coronavirus, see cdc.gov.","markups":[{"start":37,"end":45,"href":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Fcategories\u002F201931128-Policies-Safety"},{"start":125,"end":132,"href":"https:\u002F\u002Fwww.cdc.gov\u002Fcoronavirus\u002F2019-nCoV"}]},"paywall":{"text":"This article has been made free for everyone, thanks to Medium Members. For more information on the novel coronavirus and Covid-19, visit cdc.gov.","markups":[{"start":56,"end":70,"href":"https:\u002F\u002Fmedium.com\u002Fmembership"},{"start":138,"end":145,"href":"https:\u002F\u002Fwww.cdc.gov\u002Fcoronavirus\u002F2019-nCoV"}]},"unbound":{"text":"This article is free for everyone, thanks to Medium Members. For more information on the novel coronavirus and Covid-19, visit cdc.gov.","markups":[{"start":45,"end":59,"href":"https:\u002F\u002Fmedium.com\u002Fmembership"},{"start":127,"end":134,"href":"https:\u002F\u002Fwww.cdc.gov\u002Fcoronavirus\u002F2019-nCoV"}]}},"COVID_BANNER_POST_ID_OVERRIDE_WHITELIST":["3b31a67bff4a"]},"sharedVoteMessaging":{"TAGS":["politics","election-2020","government","us-politics","election","2020-presidential-race","trump","donald-trump","democrats","republicans","congress","republican-party","democratic-party","biden","joe-biden","maga"],"TOPICS":["politics","election"],"MESSAGE":{"text":"Find out more about the U.S. election results here.","markups":[{"start":46,"end":50,"href":"https:\u002F\u002Fcookpolitical.com\u002F2020-national-popular-vote-tracker"}]},"EXCLUDE_POSTS":["397ef29e3ca5"]},"embedPostRules":[],"recircOptions":{"v1":{"limit":3},"v2":{"limit":8}},"braintreeClientKey":"production_zjkj96jm_m56f8fqpf7ngnrd4","braintree":{"enabled":true,"merchantId":"m56f8fqpf7ngnrd4","merchantAccountId":{"usd":"AMediumCorporation_instant","eur":"amediumcorporation_EUR","cad":"amediumcorporation_CAD"},"publicKey":"ds2nn34bg2z7j5gd","braintreeEnvironment":"production","dashboardUrl":"https:\u002F\u002Fwww.braintreegateway.com\u002Fmerchants","gracePeriodDurationInDays":14,"mediumMembershipPlanId":{"monthly":"ce105f8c57a3","monthlyV2":"e8a5e126-792b-4ee6-8fba-d574c1b02fc5","monthlyWithTrial":"d5ee3dbe3db8","monthlyPremium":"fa741a9b47a2","yearly":"a40ad4a43185","yearlyV2":"3815d7d6-b8ca-4224-9b8c-182f9047866e","yearlyStaff":"d74fb811198a","yearlyWithTrial":"b3bc7350e5c7","yearlyPremium":"e21bd2c12166","monthlyOneYearFree":"e6c0637a-2bad-4171-ab4f-3c268633d83c","monthly25PercentOffFirstYear":"235ecc62-0cdb-49ae-9378-726cd21c504b","monthly20PercentOffFirstYear":"ba518864-9c13-4a99-91ca-411bf0cac756","monthly15PercentOffFirstYear":"594c029b-9f89-43d5-88f8-8173af4e070e","monthly10PercentOffFirstYear":"c6c7bc9a-40f2-4b51-8126-e28511d5bdb0","monthlyForStudents":"629ebe51-da7d-41fd-8293-34cd2f2030a8","yearlyOneYearFree":"78ba7be9-0d9f-4ece-aa3e-b54b826f2bf1","yearly25PercentOffFirstYear":"2dbb010d-bb8f-4eeb-ad5c-a08509f42d34","yearly20PercentOffFirstYear":"47565488-435b-47f8-bf93-40d5fbe0ebc8","yearly15PercentOffFirstYear":"8259809b-0881-47d9-acf7-6c001c7f720f","yearly10PercentOffFirstYear":"9dd694fb-96e1-472c-8d9e-3c868d5c1506","yearlyForStudents":"e29345ef-ab1c-4234-95c5-70e50fe6bc23","monthlyCad":"p52orjkaceei","yearlyCad":"h4q9g2up9ktt"},"braintreeDiscountId":{"oneMonthFree":"MONTHS_FREE_01","thr
eeMonthsFree":"MONTHS_FREE_03","sixMonthsFree":"MONTHS_FREE_06","fiftyPercentOffOneYear":"FIFTY_PERCENT_OFF_ONE_YEAR"},"3DSecureVersion":"2","defaultCurrency":"usd","providerPlanIdCurrency":{"4ycw":"usd","rz3b":"usd","3kqm":"usd","jzw6":"usd","c2q2":"usd","nnsw":"usd","q8qw":"usd","d9y6":"usd","fx7w":"cad","nwf2":"cad"}},"paypalClientId":"AXj1G4fotC2GE8KzWX9mSxCH1wmPE3nJglf4Z2ig_amnhvlMVX87otaq58niAg9iuLktVNF_1WCMnN7v","paypal":{"host":"https:\u002F\u002Fapi.paypal.com:443","clientMode":"production","serverMode":"live","webhookId":"4G466076A0294510S","monthlyPlan":{"planId":"P-9WR0658853113943TMU5FDQA","name":"Medium Membership (Monthly) with setup fee","description":"Unlimited access to the best and brightest stories on Medium. Membership billed monthly."},"yearlyPlan":{"planId":"P-7N8963881P8875835MU5JOPQ","name":"Medium Membership (Annual) with setup fee","description":"Unlimited access to the best and brightest stories on Medium. Membership billed annually."},"oneYearGift":{"name":"Medium Membership (1 Year, Digital Gift Code)","description":"Unlimited access to the best and brightest stories on Medium. Gift codes can be redeemed at medium.com\u002Fredeem.","price":"50.00","currency":"USD","sku":"membership-gift-1-yr"},"oldMonthlyPlan":{"planId":"P-96U02458LM656772MJZUVH2Y","name":"Medium Membership (Monthly)","description":"Unlimited access to the best and brightest stories on Medium. Membership billed monthly."},"oldYearlyPlan":{"planId":"P-59P80963JF186412JJZU3SMI","name":"Medium Membership (Annual)","description":"Unlimited access to the best and brightest stories on Medium. Membership billed annually."},"monthlyPlanWithTrial":{"planId":"P-66C21969LR178604GJPVKUKY","name":"Medium Membership (Monthly) with setup fee","description":"Unlimited access to the best and brightest stories on Medium. Membership billed monthly."},"yearlyPlanWithTrial":{"planId":"P-6XW32684EX226940VKCT2MFA","name":"Medium Membership (Annual) with setup fee","description":"Unlimited access to the best and brightest stories on Medium. Membership billed annually."},"oldMonthlyPlanNoSetupFee":{"planId":"P-4N046520HR188054PCJC7LJI","name":"Medium Membership (Monthly)","description":"Unlimited access to the best and brightest stories on Medium. Membership billed monthly."},"oldYearlyPlanNoSetupFee":{"planId":"P-7A4913502Y5181304CJEJMXQ","name":"Medium Membership (Annual)","description":"Unlimited access to the best and brightest stories on Medium. 
Membership billed annually."},"sdkUrl":"https:\u002F\u002Fwww.paypal.com\u002Fsdk\u002Fjs"},"stripePublishableKey":"pk_live_7FReX44VnNIInZwrIIx6ghjl","log":{"json":true,"level":"info"},"imageUploadMaxSizeMb":25,"staffPicks":{"title":"Staff Picks","catalogId":"c7bc6e1ee00f"}},"session":{"xsrf":""}}</script><script>window.__APOLLO_STATE__ = {"ROOT_QUERY":{"__typename":"Query","viewer":null,"collectionByDomainOrSlug({\"domainOrSlug\":\"rahuld3eora.medium.com\"})":null,"postResult({\"id\":\"12d803da3854\"})":{"__ref":"Post:12d803da3854"}},"LinkedAccounts:c0b8e0e6e9b0":{"__typename":"LinkedAccounts","mastodon":null,"id":"c0b8e0e6e9b0"},"UserViewerEdge:userId:c0b8e0e6e9b0-viewerId:lo_9acb5374b4d5":{"__typename":"UserViewerEdge","id":"userId:c0b8e0e6e9b0-viewerId:lo_9acb5374b4d5","isFollowing":false,"isUser":false,"isMuting":false},"NewsletterV3:6e50ffc48754":{"__typename":"NewsletterV3","id":"6e50ffc48754","type":"NEWSLETTER_TYPE_AUTHOR","slug":"c0b8e0e6e9b0","name":"c0b8e0e6e9b0","collection":null,"user":{"__ref":"User:c0b8e0e6e9b0"}},"User:c0b8e0e6e9b0":{"__typename":"User","id":"c0b8e0e6e9b0","name":"Rahul Deora","username":"rahuld3eora","newsletterV3":{"__ref":"NewsletterV3:6e50ffc48754"},"linkedAccounts":{"__ref":"LinkedAccounts:c0b8e0e6e9b0"},"isSuspended":false,"imageId":"1*yiQbwaG6UYBBeLZeDgIVLg.jpeg","mediumMemberAt":0,"verifications":{"__typename":"VerifiedInfo","isBookAuthor":false},"socialStats":{"__typename":"SocialStats","followerCount":75,"followingCount":30,"collectionFollowingCount":2},"customDomainState":{"__typename":"CustomDomainState","live":{"__typename":"CustomDomain","domain":"rahuld3eora.medium.com"}},"hasSubdomain":true,"bio":"Computer Vision Research Engineer. Personal Blog site: https:\u002F\u002Fbluesky314.github.io\u002F","isPartnerProgramEnrolled":false,"viewerEdge":{"__ref":"UserViewerEdge:userId:c0b8e0e6e9b0-viewerId:lo_9acb5374b4d5"},"viewerIsUser":false,"postSubscribeMembershipUpsellShownAt":0,"membership":null,"allowNotes":true,"twitterScreenName":""},"Paragraph:d584677e4bfb_0":{"__typename":"Paragraph","id":"d584677e4bfb_0","name":"4f80","type":"H3","href":null,"layout":null,"metadata":null,"text":"Deep Learning for Action Recognition: From Basics to Efficiency Advancements","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*aNj3n_P-oyNwuOSCZxmFiQ.png":{"__typename":"ImageMetadata","id":"1*aNj3n_P-oyNwuOSCZxmFiQ.png","originalHeight":1292,"originalWidth":1714,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:d584677e4bfb_1":{"__typename":"Paragraph","id":"d584677e4bfb_1","name":"b8c2","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*aNj3n_P-oyNwuOSCZxmFiQ.png"},"text":"The many actions in a video","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_2":{"__typename":"Paragraph","id":"d584677e4bfb_2","name":"8554","type":"P","href":null,"layout":null,"metadata":null,"text":"Action recognition is an important task in the field of computer vision that entails classifying human actions depicted in video frames. Think of it as the video counterpart of image classification. Action recognition is to videos what classification is to images. 
Instead of identifying objects in static 2D images, action recognition involves discerning actions within dynamic video clips, where each frame is essentially a 2D image connected to other 2D images in a sequence.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_3":{"__typename":"Paragraph","id":"d584677e4bfb_3","name":"16d5","type":"P","href":null,"layout":null,"metadata":null,"text":"Action Recognition is more challenging than 2D classification due to the following reasons:","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_4":{"__typename":"Paragraph","id":"d584677e4bfb_4","name":"08f1","type":"ULI","href":null,"layout":null,"metadata":null,"text":"Densely Packed Actions: Videos often present scenarios where numerous actions unfold concurrently or in quick succession","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_5":{"__typename":"Paragraph","id":"d584677e4bfb_5","name":"fe12","type":"ULI","href":null,"layout":null,"metadata":null,"text":"Long-Range Processing: Actions may extend over extended intervals, requiring long-range processing to capture the nuances and transitions effectively","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_6":{"__typename":"Paragraph","id":"d584677e4bfb_6","name":"44cc","type":"ULI","href":null,"layout":null,"metadata":null,"text":"Irrelevant Frames: Not every frame contributes to the action recognition process, and there may be many irrelevant frames which need to be ignored","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_7":{"__typename":"Paragraph","id":"d584677e4bfb_7","name":"5ae9","type":"ULI","href":null,"layout":null,"metadata":null,"text":"Expensive and Time Consuming Training: Video models are harder and more compute intensive than image models","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_8":{"__typename":"Paragraph","id":"d584677e4bfb_8","name":"ae8b","type":"ULI","href":null,"layout":null,"metadata":null,"text":"Generalization Challenges: Harder to generalize due to the amount of variations possible in the video space","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_9":{"__typename":"Paragraph","id":"d584677e4bfb_9","name":"d088","type":"P","href":null,"layout":null,"metadata":null,"text":"Videos are generally 32 or 64fps so it is common to lower the frame rate(subsample in the temporal dimension) prior to processing them.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_10":{"__typename":"Paragraph","id":"d584677e4bfb_10","name":"9029","type":"P","href":null,"layout":null,"metadata":null,"text":"In this blog, we’ll explore some of the early prominent approaches to action recognition and then cover some efficient methods that will help you get a strong overview of this 
field.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_11":{"__typename":"Paragraph","id":"d584677e4bfb_11","name":"86e7","type":"H3","href":null,"layout":null,"metadata":null,"text":"Single Stream Network","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"STRONG","start":0,"end":21,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_12":{"__typename":"Paragraph","id":"d584677e4bfb_12","name":"2289","type":"P","href":null,"layout":null,"metadata":null,"text":"Paper: Large-scale Video Classification with Convolutional Neural Networks: https:\u002F\u002Fstatic.googleusercontent.com\u002Fmedia\u002Fresearch.google.com\u002Fen\u002F\u002Fpubs\u002Farchive\u002F42455.pdf","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":76,"end":165,"href":"https:\u002F\u002Fstatic.googleusercontent.com\u002Fmedia\u002Fresearch.google.com\u002Fen\u002F\u002Fpubs\u002Farchive\u002F42455.pdf","anchorType":"LINK","userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*rSXyK5HtenB3J3b9gvOG8A.png":{"__typename":"ImageMetadata","id":"1*rSXyK5HtenB3J3b9gvOG8A.png","originalHeight":382,"originalWidth":946,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:d584677e4bfb_13":{"__typename":"Paragraph","id":"d584677e4bfb_13","name":"3fcc","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*rSXyK5HtenB3J3b9gvOG8A.png"},"text":"4 different ways to process videos","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_14":{"__typename":"Paragraph","id":"d584677e4bfb_14","name":"2572","type":"P","href":null,"layout":null,"metadata":null,"text":"This was an early seminal paper showing different ways information from different frames can be merged. The paper makes use of 3D convolution to merge multiple frames","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_15":{"__typename":"Paragraph","id":"d584677e4bfb_15","name":"b6a2","type":"P","href":null,"layout":null,"metadata":null,"text":"Single-frame: Only the one middle frame is used and processed by a 2D convolutional network to determine the accuracy for disregarding temporal information. A naive baseline.\nEarly Fusion: Here we pick T middle frames and process through a 2D convolutional network with the first filter being of size 11×11×3×T pixels where T is the temporal resolution of the filter. 
Only the middle T frames are processed as shown in the above diagram.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"STRONG","start":0,"end":12,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"STRONG","start":175,"end":187,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":309,"end":310,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_16":{"__typename":"Paragraph","id":"d584677e4bfb_16","name":"897d","type":"P","href":null,"layout":null,"metadata":null,"text":"Late Fusion: Here two separate single-frame networks with shared parameters are used at a distance of 15 frames. Then the features are merged for a final classification. This method fuses information only at the end so is known as Late Fusion\nSlow Fusion: The Slow Fusion model slowly fuses temporal information throughout the network such that higher layers get access to progressively more global information in both spatial and temporal dimensions. This is implemented by carrying out temporal convolutions(3D Convolutions) which iteratively grow in temporal receptive field size as the network progresses.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"STRONG","start":0,"end":11,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"STRONG","start":243,"end":254,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*C0WFBnELAxQZ_zYJNLkTBQ.png":{"__typename":"ImageMetadata","id":"1*C0WFBnELAxQZ_zYJNLkTBQ.png","originalHeight":458,"originalWidth":1344,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:d584677e4bfb_17":{"__typename":"Paragraph","id":"d584677e4bfb_17","name":"862f","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*C0WFBnELAxQZ_zYJNLkTBQ.png"},"text":"Results","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_18":{"__typename":"Paragraph","id":"d584677e4bfb_18","name":"4de2","type":"P","href":null,"layout":null,"metadata":null,"text":"As one would expect, the results showed that Slow Fusion performed the best of all the above methods however Single Frame was a close second.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_19":{"__typename":"Paragraph","id":"d584677e4bfb_19","name":"9306","type":"H3","href":null,"layout":null,"metadata":null,"text":"Two Stream Networks","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_20":{"__typename":"Paragraph","id":"d584677e4bfb_20","name":"9659","type":"P","href":null,"layout":null,"metadata":null,"text":"Paper: Two-Stream Convolutional Networks for Action Recognition in Videos: 
https:\u002F\u002Farxiv.org\u002Fpdf\u002F1406.2199.pdf","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":75,"end":110,"href":"https:\u002F\u002Farxiv.org\u002Fpdf\u002F1406.2199.pdf","anchorType":"LINK","userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_21":{"__typename":"Paragraph","id":"d584677e4bfb_21","name":"3090","type":"P","href":null,"layout":null,"metadata":null,"text":"One of the reasons networks such as the above, which are called Single Stream Networks, failed to live to their promise is because single frame image classification is a strong baseline. That is, it is usually possible to classify the whole video based on just the center frame run through a 2D CNN.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_22":{"__typename":"Paragraph","id":"d584677e4bfb_22","name":"30e9","type":"P","href":null,"layout":null,"metadata":null,"text":"Inspired by the two-streams hypothesis of the human visual system which states that human visual cortex contains two pathways: the ventral stream (which performs object recognition) and the dorsal stream (which recognises motion), this work attempts to aggregate spatial and temporal information via processing of a spatial and temporal components separately.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*O0HLUzyHL9wyKeieXSIJbA.png":{"__typename":"ImageMetadata","id":"1*O0HLUzyHL9wyKeieXSIJbA.png","originalHeight":347,"originalWidth":1023,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:d584677e4bfb_23":{"__typename":"Paragraph","id":"d584677e4bfb_23","name":"6ea9","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*O0HLUzyHL9wyKeieXSIJbA.png"},"text":"Two Stream Network with Spatial and Temporal streams","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_24":{"__typename":"Paragraph","id":"d584677e4bfb_24","name":"5c7c","type":"P","href":null,"layout":null,"metadata":null,"text":"Video can naturally be decomposed into spatial and temporal components. Here the spatial stream performs action classification from still video frames, whilst the temporal stream is trained to recognise action from motion in the form of dense optical flow. 
Optimal flow better isolates motion than RGB making it easy for the network to infer movements.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_25":{"__typename":"Paragraph","id":"d584677e4bfb_25","name":"6e18","type":"P","href":null,"layout":null,"metadata":null,"text":"Decoupling the spatial and temporal nets also allows us to exploit the availability of large amounts of annotated image data by pre-training the spatial net on the ImageNet challenge dataset.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*rsKIri8ucGxwiy0mvPSnTQ.png":{"__typename":"ImageMetadata","id":"1*rsKIri8ucGxwiy0mvPSnTQ.png","originalHeight":165,"originalWidth":481,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:d584677e4bfb_26":{"__typename":"Paragraph","id":"d584677e4bfb_26","name":"84d6","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*rsKIri8ucGxwiy0mvPSnTQ.png"},"text":"Optical flow","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_27":{"__typename":"Paragraph","id":"d584677e4bfb_27","name":"80f3","type":"P","href":null,"layout":null,"metadata":null,"text":"The model aims to learn about structure from the Spatial Stream and movement from the Temporal Stream. These features are then fused using a linear layer at the end. This method significantly outperforms Slow Fusion mentioned above.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_28":{"__typename":"Paragraph","id":"d584677e4bfb_28","name":"ee3f","type":"H3","href":null,"layout":null,"metadata":null,"text":"C3D: Learning Spatiotemporal Features with 3D Convolutional Networks","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_29":{"__typename":"Paragraph","id":"d584677e4bfb_29","name":"80cf","type":"P","href":null,"layout":null,"metadata":null,"text":"Paper: Learning Spatiotemporal Features with 3D Convolutional Networks: https:\u002F\u002Farxiv.org\u002Fpdf\u002F1412.0767.pdf","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":72,"end":107,"href":"https:\u002F\u002Farxiv.org\u002Fpdf\u002F1412.0767.pdf","anchorType":"LINK","userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_30":{"__typename":"Paragraph","id":"d584677e4bfb_30","name":"8069","type":"P","href":null,"layout":null,"metadata":null,"text":"This is another very important but simple paper which aims at replacing the deterministic optical flow method used previously by a 3D CNN. 
C3D: Learning Spatiotemporal Features with 3D Convolutional Networks

Paper: Learning Spatiotemporal Features with 3D Convolutional Networks: https://arxiv.org/pdf/1412.0767.pdf

This is another very important but simple paper, which aims at replacing the hand-crafted optical flow computation used previously with a 3D CNN. Optical flow methods are not perfect, and 3D CNNs can pick up more granular features when given enough data and compute.

[Figure: C3D Architecture]

The architecture is short and simple. Although 3D convolutions are significantly more expensive than their 2D counterparts, the authors simply stack them in a single stream, since a 3D convolution can pick up structure and motion concurrently. Quite a bit of data augmentation is used for robustness and generalisation.

The authors first experiment with a smaller 5-layer 3D CNN to determine the optimal temporal kernel size, which they find to be 3. They then train a larger 8-layer 3D CNN to obtain the results below.

[Figure: Results]

In a post-hoc analysis they find that their 3D convolutions learn temporal Gabor filters.

[Figure: Temporal Gabor Filters]
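A scaled-down sketch of the C3D idea, assuming 3x3x3 kernels as in the paper: spatial and temporal structure are learned together simply by stacking 3D convolutions in a single stream. The widths and depth here are illustrative and much smaller than the real 8-layer C3D.

```python
import torch
import torch.nn as nn

class MiniC3D(nn.Module):
    """Scaled-down C3D-style network: a plain stack of 3x3x3 convolutions."""

    def __init__(self, num_classes):
        super().__init__()

        def block(cin, cout, pool):
            return nn.Sequential(
                nn.Conv3d(cin, cout, kernel_size=3, padding=1),
                nn.ReLU(inplace=True),
                nn.MaxPool3d(kernel_size=pool))

        self.features = nn.Sequential(
            block(3, 64, (1, 2, 2)),     # keep time, halve space early on
            block(64, 128, (2, 2, 2)),   # downsample time and space together
            block(128, 256, (2, 2, 2)),
            nn.AdaptiveAvgPool3d(1))
        self.classifier = nn.Linear(256, num_classes)

    def forward(self, x):                # x: (N, 3, T, H, W)
        return self.classifier(self.features(x).flatten(1))
```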
I3D: Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset

Paper: Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset: https://arxiv.org/abs/1705.07750

This paper introduced the famous Kinetics dataset for action recognition, summarises the research before it in the diagram below, and proposes a two-stream version of 3D CNNs (just when we thought two streams had been replaced by 3D CNNs) by feeding optical flow to one pathway and using 3D CNNs to fuse the features of the two pathways.

[Figure: The different video architectures]

This work takes pretrained 2D convolutional models and converts them into 3D models by replicating the learned filters along the temporal dimension. The authors also find that including optical flow as an additional input still helps: most optical flow methods are iterative and capture information that is difficult for a 3D CNN to fully learn, and flow gives the model quick access to motion information from the start, helping it learn early in training.
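The inflation trick is easy to express in code. The sketch below takes a pretrained 2D convolution and replicates its kernel along a new temporal axis, rescaling by the temporal kernel size so that a video of identical frames produces the same response as the original 2D filter on a single frame; the helper name and default temporal size are mine, not the paper's.

```python
import torch
import torch.nn as nn

def inflate_conv2d(conv2d: nn.Conv2d, time_kernel: int = 3) -> nn.Conv3d:
    """Inflate a pretrained 2D convolution into a 3D one (I3D-style)."""
    conv3d = nn.Conv3d(
        conv2d.in_channels, conv2d.out_channels,
        kernel_size=(time_kernel, *conv2d.kernel_size),
        stride=(1, *conv2d.stride),
        padding=(time_kernel // 2, *conv2d.padding),
        bias=conv2d.bias is not None)
    with torch.no_grad():
        w2d = conv2d.weight                     # (Cout, Cin, kH, kW)
        # Repeat along the new temporal axis and rescale so that a clip of
        # identical frames gives the same activation as the 2D filter.
        conv3d.weight.copy_(
            w2d.unsqueeze(2).repeat(1, 1, time_kernel, 1, 1) / time_kernel)
        if conv2d.bias is not None:
            conv3d.bias.copy_(conv2d.bias)
    return conv3d
```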
[Figure: Results]

As can be seen in the results, the ImageNet-pretrained two-stream I3D that uses optical flow obtains the best accuracy (74.2).

Advances in Efficient Video Recognition

The video models above are quite heavy and require long training times. In this section we cover some important papers focused on efficient processing of videos.

We will talk about three approaches that largely aim at reducing the heavy computational cost of video models.

SlowFast Networks for Video Recognition

Paper: SlowFast Networks for Video Recognition: https://arxiv.org/pdf/1812.03982.pdf

SlowFast networks take inspiration from the P-cells and M-cells in the brain that are responsible for visual processing. M-cells operate at high temporal frequency and are responsive to fast temporal changes, but are not sensitive to spatial detail or color. P-cells provide fine spatial detail and color, but have lower temporal resolution and respond slowly to stimuli.
The paper replicates this division of labour with a two-stream network in which one stream mimics the M-cells and the other the P-cells: one stream operates at a high temporal frequency, the other at a much lower one.

[Figure: SlowFast Network]

The top, or slow, pathway subsamples frames at a low frame rate, so the images it sees are spread out in time. It uses only a fraction of the input (1/8th of the frames) and is therefore focused on capturing contextual information and structure.

The lower, or fast, pathway processes all the frames but is made very lightweight, accounting for only ~20% of the total computation. This pathway is focused on determining motion.

The slow pathway has a lower frame rate and more parameters, encouraging its high capacity to be spent on learning structure. The fast pathway has fewer parameters, encouraging it to learn motion via simpler Gabor-like filters.
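A small sketch of how the two pathway inputs can be produced from a single clip, assuming the paper's default speed ratio of alpha = 8; the tensor layout (N, C, T, H, W) and the helper name are illustrative.

```python
import torch

def slowfast_inputs(clip: torch.Tensor, alpha: int = 8):
    """Split a clip (N, C, T, H, W) into SlowFast pathway inputs.

    The fast pathway sees every frame; the slow pathway keeps only every
    alpha-th frame, so it runs at 1/alpha of the temporal rate.
    """
    fast = clip                      # full frame rate (few channels in the network)
    slow = clip[:, :, ::alpha]       # temporally strided view for the slow pathway
    return slow, fast

clip = torch.randn(2, 3, 32, 224, 224)   # hypothetical 32-frame clip
slow, fast = slowfast_inputs(clip)
print(slow.shape, fast.shape)            # (2, 3, 4, 224, 224) and (2, 3, 32, 224, 224)
```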
[Figure: SlowFast Network]

You can see above that the slow pathway uses non-temporal convolutions until res4, so it is mostly focused on semantics, whereas the fast pathway applies temporal convolutions from res2 onwards but has far fewer channels, making it more biased towards motion (Gabor-like) filters.

The authors also extend the model to other tasks such as spatio-temporal action detection.

[Figure: SlowFast for Action Detection]

The model beats previous methods at a much lower FLOP count thanks to its carefully chosen pathway and filter sizes.

[Figure: Results]
Recognition","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_62":{"__typename":"Paragraph","id":"d584677e4bfb_62","name":"4fac","type":"P","href":null,"layout":null,"metadata":null,"text":"Paper: X3D: Expanding Architectures for Efficient Video Recognition: https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.04730","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":69,"end":101,"href":"https:\u002F\u002Farxiv.org\u002Fabs\u002F2004.04730","anchorType":"LINK","userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_63":{"__typename":"Paragraph","id":"d584677e4bfb_63","name":"dfef","type":"P","href":null,"layout":null,"metadata":null,"text":"Perhaps one of my most favorite papers all time, X3D aims to determine exactly how many parameters we need to do efficient video recognition. This work is done by a single author Christoph Feichtenhofer.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_64":{"__typename":"Paragraph","id":"d584677e4bfb_64","name":"96e8","type":"P","href":null,"layout":null,"metadata":null,"text":"Previously model sizes like layer count and filter count were based on heuristics and it was unclear how many parameters are required to reach certain levels of accuracy.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_65":{"__typename":"Paragraph","id":"d584677e4bfb_65","name":"cfd2","type":"P","href":null,"layout":null,"metadata":null,"text":"There are various dimensions that effect computation like: input spatial resolution, temporal resolution, number of layers, number of filters, bottleneck dimension etc. 
[Figure: Computation Breakdown in a Video Model]

X3D takes an iterative approach to finding the best model:

1) Train a tiny base model to convergence

2) For each of the 6 computation dimensions, increase that dimension alone so that computation roughly doubles, creating 6 candidate models

3) From the 6 candidates, keep the one with the highest accuracy, discard the rest, make that expansion permanent, and repeat from 1)

In this way we learn, approximately, which dimension of compute is worth increasing, and we progressively grow the model with the best accuracy trade-off at each level of available compute.

This sounds like it would take a lot of training time, but because we start with a very small base model, the search completes after training only ~30 tiny models, which cumulatively require over 25 times fewer multiply-add operations than training one of the previous large state-of-the-art networks.
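The expansion procedure is essentially a greedy coordinate-wise search, sketched below. All three callables (`flops`, `expand`, `train_and_evaluate`) are hypothetical stand-ins rather than anything from the paper's code: `expand` scales one axis so the candidate costs roughly twice the current FLOPs, and `train_and_evaluate` trains that candidate to convergence and returns its validation accuracy.

```python
# Schematic of the X3D greedy expansion loop described in steps 1)-3) above.
AXES = ["frames", "frame_rate", "spatial_res", "width", "depth", "bottleneck"]

def expand_x3d(base_config, target_gflops, flops, expand, train_and_evaluate):
    config = base_config
    while flops(config) < target_gflops:
        # Step 2: one candidate per axis, each roughly 2x the current compute.
        candidates = {axis: expand(config, axis) for axis in AXES}
        # Step 3: keep only the candidate with the best accuracy, discard the rest.
        scores = {axis: train_and_evaluate(cfg) for axis, cfg in candidates.items()}
        best_axis = max(scores, key=scores.get)
        config = candidates[best_axis]
    return config
```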
network","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"EM","start":152,"end":153,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"EM","start":179,"end":180,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*cey1UtyB2SzcOkFgQ0SqJQ.png":{"__typename":"ImageMetadata","id":"1*cey1UtyB2SzcOkFgQ0SqJQ.png","originalHeight":521,"originalWidth":1058,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:d584677e4bfb_73":{"__typename":"Paragraph","id":"d584677e4bfb_73","name":"9ae0","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*cey1UtyB2SzcOkFgQ0SqJQ.png"},"text":"Model Capacity vs Accuracy","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_74":{"__typename":"Paragraph","id":"d584677e4bfb_74","name":"a577","type":"P","href":null,"layout":null,"metadata":null,"text":"As seen above, we can see how accuracy increases. Each point is a new model that we get as we double model capacity using the steps outlined above. After 10 GFLOPS gains are slow and we see how accuracy scales with compute clearly.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:1*Qaeuo7VkyPFRUqWCgmodFQ.png":{"__typename":"ImageMetadata","id":"1*Qaeuo7VkyPFRUqWCgmodFQ.png","originalHeight":662,"originalWidth":1488,"focusPercentX":null,"focusPercentY":null,"alt":null},"Paragraph:d584677e4bfb_75":{"__typename":"Paragraph","id":"d584677e4bfb_75","name":"2003","type":"IMG","href":null,"layout":"INSET_CENTER","metadata":{"__ref":"ImageMetadata:1*Qaeuo7VkyPFRUqWCgmodFQ.png"},"text":"Comparision with other video models for different compute levels","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:d584677e4bfb_76":{"__typename":"Paragraph","id":"d584677e4bfb_76","name":"24db","type":"P","href":null,"layout":null,"metadata":null,"text":"As we see in the above comparison X3D beats SlowFast in accuracy by less than half the TFLOPS! Refer to the paper for more details. 
A Multigrid Method for Efficiently Training Video Models

Paper: A Multigrid Method for Efficiently Training Video Models: https://arxiv.org/pdf/1912.00998.pdf

Results first!

[Figure: Speed!]

How can we train video models faster without tweaking the architecture? Is there a more efficient way to train our models?

[Figure: Image vs Video Models training time]

Video models are typically trained using a fixed mini-batch shape: a specific number of video clips, frames, and spatial dimensions. This fixed shape is chosen by heuristics to balance accuracy and training speed, and the choice involves trade-offs. Higher spatial resolutions can improve accuracy but slow down training, while lower resolutions speed up training but reduce accuracy.
Training at low spatial resolution can speed up training drastically, since we can use large batch sizes and higher learning rates, but the final accuracy is capped. Can we get both the quick training of low resolution and the added accuracy of training at higher resolution?

To this end the authors propose variable mini-batch shapes with different spatio-temporal resolutions. The shapes are determined by resampling the training data on multiple grids, so that the model gets the benefits of training at different resolutions.

[Figure: Sampling a grid differently]
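The core bookkeeping behind these variable shapes is keeping the cost of a mini-batch roughly constant: when the sampled grid shrinks, the batch grows in proportion. A tiny sketch of that scaling rule, with made-up example shapes:

```python
def scaled_batch_size(base_batch, base_shape, new_shape):
    """Scale the mini-batch size so each iteration costs about the same.

    Shapes are (T, H, W). Halving H and W gives 4x fewer pixels per clip,
    which allows roughly 4x more clips per batch for the same compute;
    this is the knob the multigrid schedule turns.
    """
    bt, bh, bw = base_shape
    nt, nh, nw = new_shape
    return max(1, int(base_batch * (bt * bh * bw) / (nt * nh * nw)))

# e.g. dropping from 16x224x224 clips to 8x112x112 allows ~8x larger batches
print(scaled_batch_size(8, (16, 224, 224), (8, 112, 112)))  # -> 64
```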
The authors explore multiple strategies for alternating between low and high resolution. When using low resolution they can train with a large batch size, which is what the y-axis below represents:

The long cycle follows a coarse-to-fine strategy: the model starts on cheap, low-resolution inputs so that early training is very fast, and is then refined at higher resolutions to reach maximum accuracy. The short cycle mixes multiple shapes at a faster cadence, and the long+short cycle combines both.

As can be seen below, training with this method reaches close to the maximum accuracy around 3 times faster than regular training.

[Figure: Speed!]
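To make the cycling concrete, here is one hypothetical long-cycle schedule in the spirit of the paper: training is split into stages that move from coarse, cheap clip shapes to the full-resolution shape used at the end. The specific shapes and stage fractions are illustrative, not the paper's exact schedule.

```python
# A hypothetical long-cycle schedule: (fraction of epochs, clip shape (T, H, W)).
LONG_CYCLE = [
    (0.25, (4, 112, 112)),   # coarse: many clips per batch, fast epochs
    (0.25, (8, 112, 112)),
    (0.25, (8, 160, 160)),
    (0.25, (16, 224, 224)),  # fine: the shape used at evaluation time
]

def shape_for_epoch(epoch, total_epochs):
    """Return the clip shape to use at a given epoch of the long cycle."""
    progress = epoch / total_epochs
    cumulative = 0.0
    for fraction, shape in LONG_CYCLE:
        cumulative += fraction
        if progress < cumulative:
            return shape
    return LONG_CYCLE[-1][1]
```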
Conclusion:

In this blog we covered early prominent approaches such as single-stream and two-stream networks, C3D, and I3D. We then explored strides in efficient video recognition, with a spotlight on pioneering methods like SlowFast networks, X3D's progressive expansion strategy, and the multigrid training method for faster convergence. The last two works are particularly impactful in their approach and provide methods that can be used in other domains of learning as well.

Hope you found this useful!
77e4bfb_46"},{"__ref":"Paragraph:d584677e4bfb_47"},{"__ref":"Paragraph:d584677e4bfb_48"},{"__ref":"Paragraph:d584677e4bfb_49"},{"__ref":"Paragraph:d584677e4bfb_50"},{"__ref":"Paragraph:d584677e4bfb_51"},{"__ref":"Paragraph:d584677e4bfb_52"},{"__ref":"Paragraph:d584677e4bfb_53"},{"__ref":"Paragraph:d584677e4bfb_54"},{"__ref":"Paragraph:d584677e4bfb_55"},{"__ref":"Paragraph:d584677e4bfb_56"},{"__ref":"Paragraph:d584677e4bfb_57"},{"__ref":"Paragraph:d584677e4bfb_58"},{"__ref":"Paragraph:d584677e4bfb_59"},{"__ref":"Paragraph:d584677e4bfb_60"},{"__ref":"Paragraph:d584677e4bfb_61"},{"__ref":"Paragraph:d584677e4bfb_62"},{"__ref":"Paragraph:d584677e4bfb_63"},{"__ref":"Paragraph:d584677e4bfb_64"},{"__ref":"Paragraph:d584677e4bfb_65"},{"__ref":"Paragraph:d584677e4bfb_66"},{"__ref":"Paragraph:d584677e4bfb_67"},{"__ref":"Paragraph:d584677e4bfb_68"},{"__ref":"Paragraph:d584677e4bfb_69"},{"__ref":"Paragraph:d584677e4bfb_70"},{"__ref":"Paragraph:d584677e4bfb_71"},{"__ref":"Paragraph:d584677e4bfb_72"},{"__ref":"Paragraph:d584677e4bfb_73"},{"__ref":"Paragraph:d584677e4bfb_74"},{"__ref":"Paragraph:d584677e4bfb_75"},{"__ref":"Paragraph:d584677e4bfb_76"},{"__ref":"Paragraph:d584677e4bfb_77"},{"__ref":"Paragraph:d584677e4bfb_78"},{"__ref":"Paragraph:d584677e4bfb_79"},{"__ref":"Paragraph:d584677e4bfb_80"},{"__ref":"Paragraph:d584677e4bfb_81"},{"__ref":"Paragraph:d584677e4bfb_82"},{"__ref":"Paragraph:d584677e4bfb_83"},{"__ref":"Paragraph:d584677e4bfb_84"},{"__ref":"Paragraph:d584677e4bfb_85"},{"__ref":"Paragraph:d584677e4bfb_86"},{"__ref":"Paragraph:d584677e4bfb_87"},{"__ref":"Paragraph:d584677e4bfb_88"},{"__ref":"Paragraph:d584677e4bfb_89"},{"__ref":"Paragraph:d584677e4bfb_90"},{"__ref":"Paragraph:d584677e4bfb_91"},{"__ref":"Paragraph:d584677e4bfb_92"},{"__ref":"Paragraph:d584677e4bfb_93"},{"__ref":"Paragraph:d584677e4bfb_94"},{"__ref":"Paragraph:d584677e4bfb_95"}]},"validatedShareKey":"","shareKeyCreator":null},"creator":{"__ref":"User:c0b8e0e6e9b0"},"inResponseToEntityType":null,"isLocked":false,"isMarkedPaywallOnly":false,"lockedSource":"LOCKED_POST_SOURCE_NONE","mediumUrl":"https:\u002F\u002Frahuld3eora.medium.com\u002Fdeep-learning-for-action-recognition-from-basics-to-efficiency-advancements-12d803da3854","primaryTopic":null,"topics":[{"__typename":"Topic","slug":"machine-learning"}],"isPublished":true,"latestPublishedVersion":"d584677e4bfb","visibility":"PUBLIC","postResponses":{"__typename":"PostResponses","count":1},"clapCount":27,"allowResponses":true,"isLimitedState":false,"title":"Deep Learning for Action Recognition: From Basics to Efficiency Advancements","isSeries":false,"sequence":null,"uniqueSlug":"deep-learning-for-action-recognition-from-basics-to-efficiency-advancements-12d803da3854","socialTitle":"","socialDek":"","canonicalUrl":"","metaDescription":"","latestPublishedAt":1706027741420,"readingTime":9.926415094339623,"previewContent":{"__typename":"PreviewContent","subtitle":"Action recognition is an important task in the field of computer vision that entails classifying human actions depicted in video 
frames…"},"previewImage":{"__ref":"ImageMetadata:1*aNj3n_P-oyNwuOSCZxmFiQ.png"},"isShortform":false,"seoTitle":"","firstPublishedAt":1706027741420,"updatedAt":1709011304996,"shortformType":"SHORTFORM_TYPE_LINK","seoDescription":"","viewerEdge":{"__ref":"PostViewerEdge:postId:12d803da3854-viewerId:lo_9acb5374b4d5"},"isSuspended":false,"license":"ALL_RIGHTS_RESERVED","tags":[{"__ref":"Tag:convolutional-network"},{"__ref":"Tag:ai"},{"__ref":"Tag:machine-learning"},{"__ref":"Tag:neural-networks"},{"__ref":"Tag:computer-vision"}],"isNewsletter":false,"statusForCollection":null,"pendingCollection":null,"detectedLanguage":"en","wordCount":2127,"layerCake":0,"responsesLocked":false}}</script><script>window.__MIDDLEWARE_STATE__={"session":{"xsrf":""},"cache":{"cacheStatus":"HIT"}}</script><script src="https://cdn-client.medium.com/lite/static/js/manifest.96244f47.js"></script><script src="https://cdn-client.medium.com/lite/static/js/9865.1496d74a.js"></script><script src="https://cdn-client.medium.com/lite/static/js/main.071d0479.js"></script><script src="https://cdn-client.medium.com/lite/static/js/instrumentation.d9108df7.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/reporting.ff22a7a5.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/5049.d1ead72d.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/4810.6318add7.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/6618.db187378.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/2707.b0942613.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/9977.5b3eb23a.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/8599.1ab63137.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/5250.9f9e01d2.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/5787.e66a3a4d.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/2648.26563adf.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/8393.826a25fb.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/7549.2176f21f.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/6589.7c500280.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/3735.afb7e926.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/5642.0a97706a.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/6546.cd03f950.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/6834.08de95de.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/7346.72622eb9.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/2420.2a5e2d95.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/839.ca7937c2.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/7975.d195c6f1.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/7394.bf599bc5.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/2961.00a48598.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/8204.c4082863.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/4391.59acaed3.chunk.js"></script> <script 
src="https://cdn-client.medium.com/lite/static/js/PostPage.MainContent.902ad94b.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/8414.6565ad5f.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/3974.8d3e0217.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/2527.a0afad8a.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/PostResponsesContent.36c2ecf4.chunk.js"></script><script>window.main();</script><script>(function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'8ebe81d2fa88409a',t:'MTczMzE3NTk1OS4wMDAwMDA='};var a=document.createElement('script');a.nonce='';a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();</script></body></html>