src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/62dafc4c2ab4d9438fee8d1c_menu-icon-logout.svg" loading="lazy" width="18" height="18" alt="" class="dropdown dropdown__image"/><div>Sign Out</div></a></nav></div></div><a data-w-id="6ab03a1e-944e-9291-1968-b70fe5f11630" href="#" class="site-header__navigation-open">Menu</a></div><div class="site-header__navigation site-header__navigation-not-auth"><div class="site-header__navigation-wrapper"><a data-w-id="6ab03a1e-944e-9291-1968-b70fe5f11634" href="#" class="site-header__navigation-close w-inline-block"><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/60908306c5938d99543d2b58_close.svg" loading="lazy" alt=""/></a><a href="https://www.semanticscholar.org/sign-in" class="site-header__navigation button button--secondary w-button">Sign In</a><a href="https://www.semanticscholar.org/sign-in" class="site-header__navigation button w-button">Create Free Account</a></div><a data-w-id="6ab03a1e-944e-9291-1968-b70fe5f11662" href="#" class="site-header__navigation-open">Menu</a></div></div></header><main class="main"><div class="section-navigation"><div class="section-navigation__container"><div class="section-navigation__intro"><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/6402965ea141a9451bcbaca9_icon-s2.svg" loading="lazy" alt="" class="section-navigation__logo"/><a href="/product/scholars-hub" class="section-navigation__title w-inline-block">Scholar's Hub</a></div><ul role="list" class="section-navigation__links w-list-unstyled"><li><a href="/product/scholars-hub/trending" class="section-navigation__link">Trending Papers</a></li><li><a href="/product/scholars-hub/award-winning-ai-and-theory" class="section-navigation__link">AI & Theory</a></li><li><a href="/product/scholars-hub/award-winning-systems-and-databases" class="section-navigation__link">Systems & Databases</a></li><li><a href="/product/scholars-hub/award-winning-hci" class="section-navigation__link">HCI</a><a href="/product/scholars-hub/semantic-scholars-picks" class="section-navigation__link">Semantic Scholar's Picks</a></li></ul></div></div><div class="blade"><div class="blade__grid blade__grid--3-2"><div id="w-node-dc13fa01-f63d-0dd6-3ea2-09dbe45fc53c-287a89a3" class="blade__content"><h1>Scholar's Hub</h1><p class="p__intro p__intro--header"><strong class="p__intro p__intro--header">Top Viewed Papers Referred by ArXiv</strong></p><p class="p__intro">Welcome to a curated collection of the most popular papers on Semantic Scholar, filtered through arXiv referrals to provide you with valuable insights into what truly piques researchers' interest. 
These papers widely resonated with the research community and also gained credibility through their popularity on arXiv, a trusted source for scholarly content.</p><p>If you have any feedback or suggestions, please <a href="/about/contact">contact us</a>.</p><p class="p--small">Last updated: December 21st, 2023</p></div><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/64adb7026d0f25c7866370b5_trending-papers.png" loading="lazy" id="w-node-f36bb904-25b2-ce45-e358-9d11b7715e8e-287a89a3" sizes="(max-width: 479px) 57vw, (max-width: 991px) 45vw, 36vw" alt="Illustration: Trending Papers" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/64adb7026d0f25c7866370b5_trending-papers-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/64adb7026d0f25c7866370b5_trending-papers-p-800.png 800w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/64adb7026d0f25c7866370b5_trending-papers.png 943w" class="blade__illustration"/></div></div><div id="mission" class="blade blade--white"><div class="blade__grid blade__grid--full"><div class="blade__content"><div class="condensed-list condensed-list--no-title"><div class="condensed-list__paper-list condensed-list__paper-list--no-title w-dyn-list"><div role="list" class="paper-list w-dyn-items"><div role="listitem" class="paper-list__paper paper-list__paper--condensed w-dyn-item"><a href="https://www.semanticscholar.org/paper/204e3073870fae3d05bcbc2f6a8e263d9b72e776?utm_source=top-arxiv-referred&utm_medium=hubpage&utm_term=1" target="_blank" class="paper-list__paper-link w-inline-block"><h4 class="paper-list__paper-title">Attention is All you Need</h4><ul role="list" class="paper-list__paper-meta"><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">Vaswani et al.</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">NIPS</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">June 12, 2017</p></li></ul><p class="paper-list__paper-abstract w-condition-invisible">The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. 
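For readers new to the paper, here is a minimal NumPy sketch of the scaled dot-product attention operation that the Transformer stacks in place of recurrence and convolution. It is a single-head illustration with made-up shapes, not the authors' multi-head implementation.

```python
import numpy as np

def scaled_dot_product_attention(Q, K, V):
    """Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V."""
    d_k = Q.shape[-1]
    scores = Q @ K.T / np.sqrt(d_k)                  # (n_q, n_k) similarity logits
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)   # row-wise softmax
    return weights @ V                               # weighted sum of value vectors

np.random.seed(0)
Q, K, V = (np.random.randn(4, 8) for _ in range(3))  # 4 tokens, d_k = d_v = 8 (illustrative)
out = scaled_dot_product_attention(Q, K, V)          # -> shape (4, 8)
```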
2. GPT-4 Technical Report
OpenAI et al. | ArXiv | March 15, 2023
https://www.semanticscholar.org/paper/8ca62fdf4c276ea3052dc96dcfd8ee96ca425a48
Abstract: We report the development of GPT-4, a large-scale, multimodal model which can accept image and text inputs and produce text outputs. While less capable than humans in many real-world scenarios, GPT-4 exhibits human-level performance on various professional and academic benchmarks, including passing a simulated bar exam with a score around the top 10% of test takers. GPT-4 is a Transformer-based model pre-trained to predict the next token in a document. The post-training alignment process results in improved performance on measures of factuality and adherence to desired behavior. A core component of this project was developing infrastructure and optimization methods that behave predictably across a wide range of scales. This allowed us to accurately predict some aspects of GPT-4's performance based on models trained with no more than 1/1,000th the compute of GPT-4.

3. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale
Dosovitskiy et al. | ICLR | October 22, 2020
https://www.semanticscholar.org/paper/268d347e8a55b5eb82fb5e7d2f800e33c75ab18a
Abstract: While the Transformer architecture has become the de-facto standard for natural language processing tasks, its applications to computer vision remain limited. In vision, attention is either applied in conjunction with convolutional networks, or used to replace certain components of convolutional networks while keeping their overall structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.), Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring substantially fewer computational resources to train.
TLDR: Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring substantially fewer computational resources to train.
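A small PyTorch sketch of the "image as 16x16 words" idea the title refers to: an image is split into non-overlapping patches that are flattened and linearly projected into tokens. The embedding width below is a made-up example value, not the configuration used in the paper.

```python
import torch

def patchify(images, patch=16):
    """Split images into non-overlapping patch tokens: the 'words' a ViT attends over."""
    B, C, H, W = images.shape
    x = images.unfold(2, patch, patch).unfold(3, patch, patch)   # (B, C, H/p, W/p, p, p)
    x = x.permute(0, 2, 3, 1, 4, 5).reshape(B, -1, C * patch * patch)
    return x                                                     # (B, num_patches, patch_dim)

imgs = torch.randn(2, 3, 224, 224)
tokens = patchify(imgs)                      # (2, 196, 768): 14x14 patches of size 16x16x3
embed = torch.nn.Linear(3 * 16 * 16, 384)    # hypothetical embedding width
print(embed(tokens).shape)                   # torch.Size([2, 196, 384])
```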
4. IR-GAN: Room Impulse Response Generator for Speech Augmentation
Ratnarajah et al. | ArXiv | October 25, 2020
https://www.semanticscholar.org/paper/c9c624f08872717965ff9a1229e9513acc2046ba
Abstract: We present a Generative Adversarial Network (GAN) based room impulse response generator for generating realistic synthetic room impulse responses. Our proposed generator can create synthetic room impulse responses by parametrically controlling the acoustic features captured in real-world room impulse responses. Our GAN-based room impulse response generator (IR-GAN) is capable of improving far-field automatic speech recognition in environments not known during training. We create a far-field speech training set by augmenting our synthesized room impulse responses with the clean LibriSpeech dataset. We evaluate the quality of our room impulse responses on the real-world LibriSpeech test set created using real impulse responses from the BUT ReverbDB and AIR datasets. Furthermore, we combine our synthetic data with synthetic impulse responses generated using acoustic simulators, and this combination can reduce the word error rate by up to 14.3% in far-field speech recognition benchmarks.
TLDR: A GAN-based room impulse response generator (IR-GAN) is capable of improving far-field automatic speech recognition in environments not known during training and can reduce the word error rate by up to 14.3% in far-field speech recognition benchmarks.
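The augmentation step the abstract describes amounts to convolving clean speech with a room impulse response to simulate far-field recording conditions. A minimal SciPy sketch, using random arrays as stand-ins for real speech and for an IR-GAN-generated impulse response:

```python
import numpy as np
from scipy.signal import fftconvolve

sr = 16000
# Stand-ins: 1 s of "clean speech" and a short, exponentially decaying impulse response.
clean = np.random.randn(sr).astype(np.float32)
rir = (np.random.randn(sr // 4) * np.exp(-np.linspace(0, 8, sr // 4))).astype(np.float32)

# Far-field augmentation: convolve clean speech with the (synthetic) room impulse response.
reverberant = fftconvolve(clean, rir, mode="full")[: len(clean)]
reverberant /= np.max(np.abs(reverberant)) + 1e-8   # normalize to avoid clipping
```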
5. UPop: Unified and Progressive Pruning for Compressing Vision-Language Transformers
Shi et al. | ICML | January 31, 2023
https://www.semanticscholar.org/paper/463910f5a3abaa8d41dbbeeedd49d5746c1ab6b8
Abstract: Real-world data contains a vast amount of multimodal information, among which vision and language are the two most representative modalities. Moreover, increasingly heavier models, e.g., Transformers, have attracted the attention of researchers to model compression. However, how to compress multimodal models, especially vision-language Transformers, is still under-explored. This paper proposes Unified and Progressive Pruning (UPop) as a universal vision-language Transformer compression framework, which incorporates 1) unifiedly searching multimodal subnets in a continuous optimization space from the original model, which enables automatic assignment of pruning ratios among compressible modalities and structures; and 2) progressively searching and retraining the subnet, which maintains convergence between the search and retrain to attain higher compression ratios. Experiments on various tasks, datasets, and model architectures demonstrate the effectiveness and versatility of the proposed UPop framework. The code is available at https://github.com/sdc17/UPop.

6. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
Devlin et al. | NAACL | October 11, 2018
https://www.semanticscholar.org/paper/df2b0e26d0599ce3e70df8a9da02e51594e0e992
Abstract: We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models (Peters et al., 2018a; Radford et al., 2018), BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5 (7.7 point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).
TLDR: A new language representation model, BERT, designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers, which can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks.
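A minimal PyTorch sketch of the "one additional output layer" used for fine-tuning: a linear classifier over the pooled [CLS] representation. The pre-trained encoder itself is not shown, and the hidden size and label count are illustrative assumptions.

```python
import torch
import torch.nn as nn

class ClassificationHead(nn.Module):
    """Linear classifier placed on top of a pre-trained encoder's [CLS] representation."""
    def __init__(self, hidden_size=768, num_labels=2):
        super().__init__()
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, cls_hidden_state):          # (batch, hidden_size) pooled output
        return self.classifier(self.dropout(cls_hidden_state))

logits = ClassificationHead()(torch.randn(8, 768))   # -> (8, 2) class logits
```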
7. The Slingshot Mechanism: An Empirical Study of Adaptive Optimizers and the Grokking Phenomenon
Thilak et al. | ArXiv | June 10, 2022
https://www.semanticscholar.org/paper/ff53cb49cb18e71e622fce7d96692e813630e878
Abstract: The grokking phenomenon as reported by Power et al. (arXiv:2201.02177) refers to a regime where a long period of overfitting is followed by a seemingly sudden transition to perfect generalization. In this paper, we attempt to reveal the underpinnings of grokking via a series of empirical studies. Specifically, we uncover an optimization anomaly plaguing adaptive optimizers at extremely late stages of training, referred to as the Slingshot Mechanism. A prominent artifact of the Slingshot Mechanism can be measured by the cyclic phase transitions between stable and unstable training regimes, and can be easily monitored by the cyclic behavior of the norm of the last layer's weights. We empirically observe that without explicit regularization, grokking as reported in arXiv:2201.02177 almost exclusively happens at the onset of Slingshots, and is absent without it. While common and easily reproduced in more general settings, the Slingshot Mechanism does not follow from any known optimization theories that we are aware of, and can be easily overlooked without an in-depth examination. Our work points to a surprising and useful inductive bias of adaptive gradient optimizers at late stages of training, calling for a revised theoretical analysis of their origin.

8. Global Convergence of EM Algorithm for Mixtures of Two Component Linear Regression
Kwon et al. | ArXiv | October 12, 2018
https://www.semanticscholar.org/paper/458355e8ee3c7233de8b5f6bbf101446f9ef380b
Abstract: The Expectation-Maximization algorithm is perhaps the most broadly used algorithm for inference of latent variable problems. A theoretical understanding of its performance, however, largely remains lacking. Recent results established that EM enjoys global convergence for Gaussian Mixture Models. For Mixed Linear Regression, however, only local convergence results have been established, and those only for the high SNR regime. We show here that EM converges for mixed linear regression with two components (it is known that it may fail to converge for three or more), and moreover that this convergence holds for random initialization. Our analysis reveals that EM exhibits very different behavior in Mixed Linear Regression from its behavior in Gaussian Mixture Models, and hence our proofs require the development of several new ideas.
TLDR: It is shown here that EM converges for mixed linear regression with two components (it is known that it may fail to converge for three or more), and moreover that this convergence holds for random initialization.
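To make the setting concrete, here is a toy NumPy sketch of EM for the symmetric two-component mixed linear regression model (y = s · ⟨β, x⟩ + noise, with a random sign s), started from a random initialization as studied in the paper. It illustrates the E- and M-steps only, not the paper's analysis, and the problem sizes are made up.

```python
import numpy as np

def em_mixed_linear_regression(X, y, beta, sigma=0.5, iters=50):
    """EM for symmetric 2-component MLR.
    E-step: posterior sign expectation tanh(y <x, beta> / sigma^2).
    M-step: closed-form weighted least squares."""
    for _ in range(iters):
        w = np.tanh((y * (X @ beta)) / sigma**2)        # E-step: E[s | x, y, beta] in (-1, 1)
        beta = np.linalg.solve(X.T @ X, X.T @ (w * y))  # M-step
    return beta

rng = np.random.default_rng(0)
n, d = 2000, 5
X = rng.standard_normal((n, d))
beta_true = rng.standard_normal(d)
y = rng.choice([-1.0, 1.0], size=n) * (X @ beta_true) + 0.5 * rng.standard_normal(n)
beta_hat = em_mixed_linear_regression(X, y, beta=rng.standard_normal(d))
# beta_hat should land near +/- beta_true (the global sign is not identifiable).
```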
9. Deep Residual Learning for Image Recognition
He et al. | 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) | December 10, 2015
https://www.semanticscholar.org/paper/2c03df8b48bf3fa39054345bafabfeff15bfd11d
Abstract: Deeper neural networks are more difficult to train. We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers, 8× deeper than VGG nets [40] but still having lower complexity. An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. This result won the 1st place on the ILSVRC 2015 classification task. We also present analysis on CIFAR-10 with 100 and 1000 layers. The depth of representations is of central importance for many visual recognition tasks. Solely due to our extremely deep representations, we obtain a 28% relative improvement on the COCO object detection dataset. Deep residual nets are foundations of our submissions to the ILSVRC & COCO 2015 competitions, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation.
TLDR: This work presents a residual learning framework to ease the training of networks that are substantially deeper than those used previously, and provides comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth.
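A minimal PyTorch sketch of the residual idea: a block whose layers learn F(x) and whose output is F(x) + x via an identity shortcut. Channel counts are illustrative, and this is a single basic block, not the full ResNet architecture.

```python
import torch
import torch.nn as nn

class ResidualBlock(nn.Module):
    """Basic residual block: the stacked layers learn a residual F(x) with reference
    to the input, and the block returns F(x) + x through an identity shortcut."""
    def __init__(self, channels):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        return self.relu(out + x)          # identity shortcut

y = ResidualBlock(64)(torch.randn(2, 64, 32, 32))   # output has the same shape as the input
```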
10. On the Minimax Optimality of the EM Algorithm for Learning Two-Component Mixed Linear Regression
Kwon et al. | AISTATS | June 4, 2020
https://www.semanticscholar.org/paper/a2a3f5f141fbd938fa1cb16912058f1badc80826
Abstract: We study the convergence rates of the EM algorithm for learning two-component mixed linear regression under all regimes of signal-to-noise ratio (SNR). We resolve a long-standing question that many recent results have attempted to tackle: we completely characterize the convergence behavior of EM, and show that the EM algorithm achieves minimax optimal sample complexity under all SNR regimes. In particular, when the SNR is sufficiently large, the EM updates converge to the true parameter $\theta^{*}$ at the standard parametric convergence rate $\mathcal{O}((d/n)^{1/2})$ after $\mathcal{O}(\log(n/d))$ iterations. In the regime where the SNR is above $\mathcal{O}((d/n)^{1/4})$ and below some constant, the EM iterates converge to a $\mathcal{O}({\rm SNR}^{-1} (d/n)^{1/2})$ neighborhood of the true parameter, when the number of iterations is of the order $\mathcal{O}({\rm SNR}^{-2} \log(n/d))$. In the low SNR regime where the SNR is below $\mathcal{O}((d/n)^{1/4})$, we show that EM converges to a $\mathcal{O}((d/n)^{1/4})$ neighborhood of the true parameters, after $\mathcal{O}((n/d)^{1/2})$ iterations. Notably, these results are achieved under mild conditions of either random initialization or an efficiently computable local initialization. By providing tight convergence guarantees of the EM algorithm in middle-to-low SNR regimes, we fill the remaining gap in the literature, and significantly, reveal that in low SNR, EM changes rate, matching the $n^{-1/4}$ rate of the MLE, a behavior that previous work had been unable to show.
TLDR: By providing tight convergence guarantees of the EM algorithm in middle-to-low SNR regimes, this work fills the remaining gap in the literature, and significantly reveals that in low SNR, EM changes rate, matching the $n^{-1/4}$ rate of the MLE, a behavior that previous work had been unable to show.

11. Very Deep Convolutional Networks for Large-Scale Image Recognition
Simonyan et al. | ICLR | September 4, 2014
https://www.semanticscholar.org/paper/eb42cf88027de515750f230b23b1a057dc782108
Abstract: In this work we investigate the effect of the convolutional network depth on its accuracy in the large-scale image recognition setting. Our main contribution is a thorough evaluation of networks of increasing depth using an architecture with very small (3x3) convolution filters, which shows that a significant improvement on the prior-art configurations can be achieved by pushing the depth to 16-19 weight layers. These findings were the basis of our ImageNet Challenge 2014 submission, where our team secured the first and the second places in the localisation and classification tracks respectively. We also show that our representations generalise well to other datasets, where they achieve state-of-the-art results. We have made our two best-performing ConvNet models publicly available to facilitate further research on the use of deep visual representations in computer vision.
TLDR: This work investigates the effect of the convolutional network depth on its accuracy in the large-scale image recognition setting using an architecture with very small convolution filters, which shows that a significant improvement on the prior-art configurations can be achieved by pushing the depth to 16-19 weight layers.
12. A Fully First-Order Method for Stochastic Bilevel Optimization
Kwon et al. | ICML | January 26, 2023
https://www.semanticscholar.org/paper/fe537df556f2036d0c51fed96da3060b12961c14
Abstract: We consider stochastic unconstrained bilevel optimization problems when only the first-order gradient oracles are available. While numerous optimization methods have been proposed for tackling bilevel problems, existing methods either tend to require possibly expensive calculations regarding Hessians of lower-level objectives, or lack rigorous finite-time performance guarantees. In this work, we propose a Fully First-order Stochastic Approximation (F2SA) method, and study its non-asymptotic convergence properties. Specifically, we show that F2SA converges to an $\epsilon$-stationary solution of the bilevel problem after $\epsilon^{-7/2}, \epsilon^{-5/2}$, and $\epsilon^{-3/2}$ iterations (each iteration using $O(1)$ samples) when stochastic noises are in both level objectives, only in the upper-level objective, and not present (deterministic settings), respectively. We further show that if we employ momentum-assisted gradient estimators, the iteration complexities can be improved to $\epsilon^{-5/2}, \epsilon^{-4/2}$, and $\epsilon^{-3/2}$, respectively. We demonstrate even superior practical performance of the proposed method over existing second-order based approaches on MNIST data-hypercleaning experiments.

13. Language Models are Few-Shot Learners
Brown et al. | NeurIPS | May 28, 2020
https://www.semanticscholar.org/paper/6b85b63579a916f705a8e10a49bd8d849d91b1fc
Abstract: Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions, something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.
TLDR: GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic.
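Few-shot prompting as described here needs no gradient updates: task demonstrations are simply placed in the prompt and the model is asked to continue the pattern. A tiny illustration (the translation pairs are made-up examples in the spirit of the paper's figures):

```python
# Demonstrations and the query live in the same text prompt; the model completes it.
few_shot_prompt = """Translate English to French:
sea otter => loutre de mer
peppermint => menthe poivrée
cheese =>"""
# The string above would be sent to the model as-is; no fine-tuning is involved.
```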
14. A Collaborative Learning Framework via Federated Meta-Learning
Lin et al. | 2020 IEEE 40th International Conference on Distributed Computing Systems (ICDCS) | January 9, 2020
https://www.semanticscholar.org/paper/f8562e48844bba90c55a505a93aa1ac652839a45
Abstract: Many IoT applications at the network edge demand intelligent decisions in a real-time manner. The edge device alone, however, often cannot achieve real-time edge intelligence due to its constrained computing resources and limited local data. To tackle these challenges, we propose a platform-aided collaborative learning framework where a model is first trained across a set of source edge nodes by a federated meta-learning approach, and then it is rapidly adapted to learn a new task at the target edge node, using a few samples only. Further, we investigate the convergence of the proposed federated meta-learning algorithm under mild conditions on node similarity and the adaptation performance at the target edge. To combat against the vulnerability of meta-learning algorithms to possible adversarial attacks, we further propose a robust version of the federated meta-learning algorithm based on distributionally robust optimization, and establish its convergence under mild conditions. Experiments on different datasets demonstrate the effectiveness of the proposed federated meta-learning based framework.
TLDR: This work proposes a platform-aided collaborative learning framework where a model is first trained across a set of source edge nodes by a federated meta-learning approach, and then it is rapidly adapted to learn a new task at the target edge node, using a few samples only.

15. U-Net: Convolutional Networks for Biomedical Image Segmentation
Ronneberger et al. | MICCAI | May 18, 2015
https://www.semanticscholar.org/paper/6364fdaa0a0eccd823a779fcdd489173f938e91a
Abstract: There is large consent that successful training of deep networks requires many thousand annotated training samples. In this paper, we present a network and training strategy that relies on the strong use of data augmentation to use the available annotated samples more efficiently. The architecture consists of a contracting path to capture context and a symmetric expanding path that enables precise localization. We show that such a network can be trained end-to-end from very few images and outperforms the prior best method (a sliding-window convolutional network) on the ISBI challenge for segmentation of neuronal structures in electron microscopic stacks. Using the same network trained on transmitted light microscopy images (phase contrast and DIC) we won the ISBI cell tracking challenge 2015 in these categories by a large margin. Moreover, the network is fast. Segmentation of a 512x512 image takes less than a second on a recent GPU. The full implementation (based on Caffe) and the trained networks are available at http://lmb.informatik.uni-freiburg.de/people/ronneber/u-net.
TLDR: It is shown that such a network can be trained end-to-end from very few images and outperforms the prior best method (a sliding-window convolutional network) on the ISBI challenge for segmentation of neuronal structures in electron microscopic stacks.
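A heavily reduced PyTorch sketch of the architecture the abstract describes: one contracting step, one expanding step, and a skip connection that concatenates encoder features into the decoder. Channel counts and depth are illustrative and far smaller than the published U-Net.

```python
import torch
import torch.nn as nn

class TinyUNet(nn.Module):
    """Minimal U-Net-style encoder/decoder with a single skip connection."""
    def __init__(self, in_ch=1, out_ch=2):
        super().__init__()
        self.enc = nn.Sequential(nn.Conv2d(in_ch, 16, 3, padding=1), nn.ReLU(),
                                 nn.Conv2d(16, 16, 3, padding=1), nn.ReLU())
        self.down = nn.MaxPool2d(2)
        self.bottleneck = nn.Sequential(nn.Conv2d(16, 32, 3, padding=1), nn.ReLU())
        self.up = nn.ConvTranspose2d(32, 16, 2, stride=2)
        self.dec = nn.Sequential(nn.Conv2d(32, 16, 3, padding=1), nn.ReLU())
        self.head = nn.Conv2d(16, out_ch, 1)

    def forward(self, x):
        e = self.enc(x)                         # contracting path features
        b = self.bottleneck(self.down(e))       # coarse context
        d = self.up(b)                          # expanding path
        d = self.dec(torch.cat([d, e], dim=1))  # skip connection: concatenate encoder features
        return self.head(d)

seg_logits = TinyUNet()(torch.randn(1, 1, 64, 64))   # -> (1, 2, 64, 64) per-pixel class logits
```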
16. Learning Transferable Visual Models From Natural Language Supervision
Radford et al. | ICML | February 26, 2021
https://www.semanticscholar.org/paper/6f870f7f02a8c59c3e23f407f3ef00dd1dcf8fc4
Abstract: State-of-the-art computer vision systems are trained to predict a fixed set of predetermined object categories. This restricted form of supervision limits their generality and usability since additional labeled data is needed to specify any other visual concept. Learning directly from raw text about images is a promising alternative which leverages a much broader source of supervision. We demonstrate that the simple pre-training task of predicting which caption goes with which image is an efficient and scalable way to learn SOTA image representations from scratch on a dataset of 400 million (image, text) pairs collected from the internet. After pre-training, natural language is used to reference learned visual concepts (or describe new ones) enabling zero-shot transfer of the model to downstream tasks. We study the performance of this approach by benchmarking on over 30 different existing computer vision datasets, spanning tasks such as OCR, action recognition in videos, geo-localization, and many types of fine-grained object classification. The model transfers non-trivially to most tasks and is often competitive with a fully supervised baseline without the need for any dataset-specific training. For instance, we match the accuracy of the original ResNet-50 on ImageNet zero-shot without needing to use any of the 1.28 million training examples it was trained on. We release our code and pre-trained model weights at https://github.com/OpenAI/CLIP.
TLDR: It is demonstrated that the simple pre-training task of predicting which caption goes with which image is an efficient and scalable way to learn SOTA image representations from scratch on a dataset of 400 million (image, text) pairs collected from the internet.
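A minimal PyTorch sketch of the pre-training signal described in the abstract: a symmetric contrastive loss that predicts which caption goes with which image within a batch. The embedding size and temperature are illustrative assumptions; this is not the released CLIP code.

```python
import torch
import torch.nn.functional as F

def contrastive_pair_loss(image_emb, text_emb, temperature=0.07):
    """Symmetric cross-entropy over an image-text similarity matrix: matching
    (image, caption) pairs sit on the diagonal and are treated as the positives."""
    image_emb = F.normalize(image_emb, dim=-1)
    text_emb = F.normalize(text_emb, dim=-1)
    logits = image_emb @ text_emb.t() / temperature   # (B, B) cosine similarities
    targets = torch.arange(logits.size(0))            # index of the matching pair
    return (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets)) / 2

loss = contrastive_pair_loss(torch.randn(8, 512), torch.randn(8, 512))  # toy embeddings
```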
17. Coordinated Attacks against Contextual Bandits: Fundamental Limits and Defense Mechanisms
Kwon et al. | ICML | January 30, 2022
https://www.semanticscholar.org/paper/fd2d1d18278fb406bc7de619776a0cf619e415bb
Abstract: Motivated by online recommendation systems, we propose the problem of finding the optimal policy in multitask contextual bandits when a small fraction $\alpha<1/2$ of tasks (users) are arbitrary and adversarial. The remaining fraction of good users share the same instance of contextual bandits with $S$ contexts and $A$ actions (items). Naturally, whether a user is good or adversarial is not known in advance. The goal is to robustly learn the policy that maximizes rewards for good users with as few user interactions as possible. Without adversarial users, established results in collaborative filtering show that $O(1/\epsilon^2)$ per-user interactions suffice to learn a good policy, precisely because information can be shared across users. This parallelization gain is fundamentally altered by the presence of adversarial users: unless there is a super-polynomial number of users, we show a lower bound of $\tilde{\Omega}(\min(S,A) \cdot \alpha^2 / \epsilon^2)$ per-user interactions to learn an $\epsilon$-optimal policy for the good users. We then show we can achieve an $\tilde{O}(\min(S,A)\cdot \alpha/\epsilon^2)$ upper bound, by employing efficient robust mean estimators for both uni-variate and high-dimensional random variables. We also show that this can be improved depending on the distributions of contexts.
TLDR: This work shows it can achieve an $\tilde{O}(\min(S,A)\cdot \alpha/\epsilon^2)$ upper bound, by employing efficient robust mean estimators for both uni-variate and high-dimensional random variables, and shows that this can be improved depending on the distributions of contexts.

18. Adam: A Method for Stochastic Optimization
Kingma et al. | ICLR | December 22, 2014
https://www.semanticscholar.org/paper/a6cb366736791bcccc5c8639de5a8f9636bf87e8
Abstract: We introduce Adam, an algorithm for first-order gradient-based optimization of stochastic objective functions, based on adaptive estimates of lower-order moments. The method is straightforward to implement, is computationally efficient, has little memory requirements, is invariant to diagonal rescaling of the gradients, and is well suited for problems that are large in terms of data and/or parameters. The method is also appropriate for non-stationary objectives and problems with very noisy and/or sparse gradients. The hyper-parameters have intuitive interpretations and typically require little tuning. Some connections to related algorithms, on which Adam was inspired, are discussed. We also analyze the theoretical convergence properties of the algorithm and provide a regret bound on the convergence rate that is comparable to the best known results under the online convex optimization framework. Empirical results demonstrate that Adam works well in practice and compares favorably to other stochastic optimization methods. Finally, we discuss AdaMax, a variant of Adam based on the infinity norm.
TLDR: This work introduces Adam, an algorithm for first-order gradient-based optimization of stochastic objective functions, based on adaptive estimates of lower-order moments, and provides a regret bound on the convergence rate that is comparable to the best known results under the online convex optimization framework.
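A compact NumPy sketch of the update rule the abstract summarizes: exponential moving averages of the gradient and its square, with bias correction. The defaults follow the hyper-parameters suggested in the paper; the toy objective is made up.

```python
import numpy as np

def adam_step(theta, grad, m, v, t, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8):
    """One Adam update using bias-corrected first- and second-moment estimates."""
    m = beta1 * m + (1 - beta1) * grad            # first moment (mean of gradients)
    v = beta2 * v + (1 - beta2) * grad**2         # second moment (uncentered variance)
    m_hat = m / (1 - beta1**t)                    # bias correction
    v_hat = v / (1 - beta2**t)
    theta = theta - lr * m_hat / (np.sqrt(v_hat) + eps)
    return theta, m, v

# Toy example: minimize f(theta) = ||theta||^2, whose gradient is 2 * theta.
theta, m, v = np.array([1.0, -2.0]), np.zeros(2), np.zeros(2)
for t in range(1, 2001):
    theta, m, v = adam_step(theta, 2 * theta, m, v, t, lr=0.05)
# theta is now close to the minimizer (0, 0).
```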
Experiments on three large language models show that chain of thought prompting improves performance on a range of arithmetic, commonsense, and symbolic reasoning tasks. The empirical gains can be striking. For instance, prompting a 540B-parameter language model with just eight chain of thought exemplars achieves state of the art accuracy on the GSM8K benchmark of math word problems, surpassing even finetuned GPT-3 with a verifier.</p><div class="paper-list__paper-tldr-holder w-clearfix"><div class="pill pill--gray">TLDR</div><p class="paper-list__paper-abstract">Experiments on three large language models show that chain-of-thought prompting improves performance on a range of arithmetic, commonsense, and symbolic reasoning tasks.</p></div></a></div><div role="listitem" class="paper-list__paper paper-list__paper--condensed w-dyn-item"><a href="https://www.semanticscholar.org/paper/8834e8f3799ae759ee8a5bb5c8b939cf650b01cd?utm_source=top-arxiv-referred&utm_medium=hubpage&utm_term=21" target="_blank" class="paper-list__paper-link w-inline-block"><h4 class="paper-list__paper-title">Artificial Artificial Artificial Intelligence: Crowd Workers Widely Use Large Language Models for Text Production Tasks</h4><ul role="list" class="paper-list__paper-meta"><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">Veselovsky et al.</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">ArXiv</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">June 13, 2023</p></li></ul><p class="paper-list__paper-abstract">Large language models (LLMs) are remarkable data annotators. They can be used to generate high-fidelity supervised training data, as well as survey and experimental data. With the widespread adoption of LLMs, human gold--standard annotations are key to understanding the capabilities of LLMs and the validity of their results. However, crowdsourcing, an important, inexpensive way to obtain human annotations, may itself be impacted by LLMs, as crowd workers have financial incentives to use LLMs to increase their productivity and income. To investigate this concern, we conducted a case study on the prevalence of LLM usage by crowd workers. We reran an abstract summarization task from the literature on Amazon Mechanical Turk and, through a combination of keystroke detection and synthetic text classification, estimate that 33-46% of crowd workers used LLMs when completing the task. Although generalization to other, less LLM-friendly tasks is unclear, our results call for platforms, researchers, and crowd workers to find new ways to ensure that human data remain human, perhaps using the methodology proposed here as a stepping stone. 
Code/data: https://github.com/epfl-dlab/GPTurk</p><div class="paper-list__paper-tldr-holder w-clearfix"><div class="pill pill--gray w-condition-invisible">TLDR</div><p class="paper-list__paper-abstract w-dyn-bind-empty"></p></div></a></div><div role="listitem" class="paper-list__paper paper-list__paper--condensed w-dyn-item"><a href="https://www.semanticscholar.org/paper/a8ca46b171467ceb2d7652fbfb67fe701ad86092?utm_source=top-arxiv-referred&utm_medium=hubpage&utm_term=22" target="_blank" class="paper-list__paper-link w-inline-block"><h4 class="paper-list__paper-title">LoRA: Low-Rank Adaptation of Large Language Models</h4><ul role="list" class="paper-list__paper-meta"><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">Hu et al.</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">ICLR</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">June 17, 2021</p></li></ul><p class="paper-list__paper-abstract w-condition-invisible">An important paradigm of natural language processing consists of large-scale pre-training on general domain data and adaptation to particular tasks or domains. As we pre-train larger models, full fine-tuning, which retrains all model parameters, becomes less feasible. Using GPT-3 175B as an example -- deploying independent instances of fine-tuned models, each with 175B parameters, is prohibitively expensive. We propose Low-Rank Adaptation, or LoRA, which freezes the pre-trained model weights and injects trainable rank decomposition matrices into each layer of the Transformer architecture, greatly reducing the number of trainable parameters for downstream tasks. Compared to GPT-3 175B fine-tuned with Adam, LoRA can reduce the number of trainable parameters by 10,000 times and the GPU memory requirement by 3 times. LoRA performs on-par or better than fine-tuning in model quality on RoBERTa, DeBERTa, GPT-2, and GPT-3, despite having fewer trainable parameters, a higher training throughput, and, unlike adapters, no additional inference latency. We also provide an empirical investigation into rank-deficiency in language model adaptation, which sheds light on the efficacy of LoRA. 
We release a package that facilitates the integration of LoRA with PyTorch models and provide our implementations and model checkpoints for RoBERTa, DeBERTa, and GPT-2 at https://github.com/microsoft/LoRA.</p><div class="paper-list__paper-tldr-holder w-clearfix"><div class="pill pill--gray">TLDR</div><p class="paper-list__paper-abstract">Low-Rank Adaptation, or LoRA, is proposed, which freezes the pre-trained model weights and injects trainable rank decomposition matrices into each layer of the Transformer architecture, greatly reducing the number of trainable parameters for downstream tasks.</p></div></a></div><div role="listitem" class="paper-list__paper paper-list__paper--condensed w-dyn-item"><a href="https://www.semanticscholar.org/paper/104b0bb1da562d53cbda87aec79ef6a2827d191a?utm_source=top-arxiv-referred&utm_medium=hubpage&utm_term=23" target="_blank" class="paper-list__paper-link w-inline-block"><h4 class="paper-list__paper-title">Llama 2: Open Foundation and Fine-Tuned Chat Models</h4><ul role="list" class="paper-list__paper-meta"><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">Touvron et al.</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">ArXiv</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">July 18, 2023</p></li></ul><p class="paper-list__paper-abstract">In this work, we develop and release Llama 2, a collection of pretrained and fine-tuned large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters. Our fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. Our models outperform open-source chat models on most benchmarks we tested, and based on our human evaluations for helpfulness and safety, may be a suitable substitute for closed-source models. We provide a detailed description of our approach to fine-tuning and safety improvements of Llama 2-Chat in order to enable the community to build on our work and contribute to the responsible development of LLMs.</p><div class="paper-list__paper-tldr-holder w-clearfix"><div class="pill pill--gray w-condition-invisible">TLDR</div><p class="paper-list__paper-abstract w-dyn-bind-empty"></p></div></a></div><div role="listitem" class="paper-list__paper paper-list__paper--condensed w-dyn-item"><a href="https://www.semanticscholar.org/paper/04fa3ebc4c0c0b4a2d2d1a3fc612134a05057696?utm_source=top-arxiv-referred&utm_medium=hubpage&utm_term=24" target="_blank" class="paper-list__paper-link w-inline-block"><h4 class="paper-list__paper-title">Prompt Injection Attacks and Defenses in LLM-Integrated Applications</h4><ul role="list" class="paper-list__paper-meta"><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">Liu et al.</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">ArXiv</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">October 19, 2023</p></li></ul><p class="paper-list__paper-abstract">Large Language Models (LLMs) are increasingly deployed as the backend for a variety of real-world applications called LLM-Integrated Applications. Multiple recent works showed that LLM-Integrated Applications are vulnerable to prompt injection attacks, in which an attacker injects malicious instruction/data into the input of those applications such that they produce results as the attacker desires. However, existing works are limited to case studies. 
As a result, the literature lacks a systematic understanding of prompt injection attacks and their defenses. We aim to bridge the gap in this work. In particular, we propose a general framework to formalize prompt injection attacks. Existing attacks, which are discussed in research papers and blog posts, are special cases in our framework. Our framework enables us to design a new attack by combining existing attacks. Moreover, we also propose a framework to systematize defenses against prompt injection attacks. Using our frameworks, we conduct a systematic evaluation on prompt injection attacks and their defenses with 10 LLMs and 7 tasks. We hope our frameworks can inspire future research in this field. Our code is available at https://github.com/liu00222/Open-Prompt-Injection.</p><div class="paper-list__paper-tldr-holder w-clearfix"><div class="pill pill--gray w-condition-invisible">TLDR</div><p class="paper-list__paper-abstract w-dyn-bind-empty"></p></div></a></div><div role="listitem" class="paper-list__paper paper-list__paper--condensed w-dyn-item"><a href="https://www.semanticscholar.org/paper/86ee1835a56722b76564119437070782fc90eb19?utm_source=top-arxiv-referred&utm_medium=hubpage&utm_term=25" target="_blank" class="paper-list__paper-link w-inline-block"><h4 class="paper-list__paper-title">Generative Adversarial Nets</h4><ul role="list" class="paper-list__paper-meta"><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">Goodfellow et al.</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">NIPS</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">June 10, 2014</p></li></ul><p class="paper-list__paper-abstract">We propose a new framework for estimating generative models via an adversarial process, in which we simultaneously train two models: a generative model G that captures the data distribution, and a discriminative model D that estimates the probability that a sample came from the training data rather than G. The training procedure for G is to maximize the probability of D making a mistake. This framework corresponds to a minimax two-player game. In the space of arbitrary functions G and D, a unique solution exists, with G recovering the training data distribution and D equal to ½ everywhere. In the case where G and D are defined by multilayer perceptrons, the entire system can be trained with backpropagation. There is no need for any Markov chains or unrolled approximate inference networks during either training or generation of samples. 
Experiments demonstrate the potential of the framework through qualitative and quantitative evaluation of the generated samples.</p><div class="paper-list__paper-tldr-holder w-clearfix"><div class="pill pill--gray w-condition-invisible">TLDR</div><p class="paper-list__paper-abstract w-dyn-bind-empty"></p></div></a></div><div role="listitem" class="paper-list__paper paper-list__paper--condensed w-dyn-item"><a href="https://www.semanticscholar.org/paper/2a6f7f0d659c5f7dcd665064b71e7b751592c80e?utm_source=top-arxiv-referred&utm_medium=hubpage&utm_term=26" target="_blank" class="paper-list__paper-link w-inline-block"><h4 class="paper-list__paper-title">YOLOv4: Optimal Speed and Accuracy of Object Detection</h4><ul role="list" class="paper-list__paper-meta"><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">Bochkovskiy et al.</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">ArXiv</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">April 23, 2020</p></li></ul><p class="paper-list__paper-abstract w-condition-invisible">There are a huge number of features which are said to improve Convolutional Neural Network (CNN) accuracy. Practical testing of combinations of such features on large datasets, and theoretical justification of the result, is required. Some features operate on certain models exclusively and for certain problems exclusively, or only for small-scale datasets; while some features, such as batch-normalization and residual-connections, are applicable to the majority of models, tasks, and datasets. We assume that such universal features include Weighted-Residual-Connections (WRC), Cross-Stage-Partial-connections (CSP), Cross mini-Batch Normalization (CmBN), Self-adversarial-training (SAT) and Mish-activation. We use new features: WRC, CSP, CmBN, SAT, Mish activation, Mosaic data augmentation, CmBN, DropBlock regularization, and CIoU loss, and combine some of them to achieve state-of-the-art results: 43.5% AP (65.7% AP50) for the MS COCO dataset at a realtime speed of ~65 FPS on Tesla V100. 
Source code is at this https URL</p><div class="paper-list__paper-tldr-holder w-clearfix"><div class="pill pill--gray">TLDR</div><p class="paper-list__paper-abstract">This work uses new features: WRC, CSP, CmBN, SAT, Mish activation, Mosaic data augmentation, CmBN, DropBlock regularization, and CIoU loss, and combines some of them to achieve state-of-the-art results: 43.5% AP for the MS COCO dataset at a realtime speed of ~65 FPS on Tesla V100.</p></div></a></div><div role="listitem" class="paper-list__paper paper-list__paper--condensed w-dyn-item"><a href="https://www.semanticscholar.org/paper/424561d8585ff8ebce7d5d07de8dbf7aae5e7270?utm_source=top-arxiv-referred&utm_medium=hubpage&utm_term=27" target="_blank" class="paper-list__paper-link w-inline-block"><h4 class="paper-list__paper-title">Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks</h4><ul role="list" class="paper-list__paper-meta"><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">Ren et al.</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">IEEE Transactions on Pattern Analysis and Machine Intelligence</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">June 4, 2015</p></li></ul><p class="paper-list__paper-abstract w-condition-invisible">State-of-the-art object detection networks depend on region proposal algorithms to hypothesize object locations. Advances like SPPnet and Fast R-CNN have reduced the running time of these detection networks, exposing region proposal computation as a bottleneck. In this work, we introduce a Region Proposal Network (RPN) that shares full-image convolutional features with the detection network, thus enabling nearly cost-free region proposals. An RPN is a fully convolutional network that simultaneously predicts object bounds and objectness scores at each position. The RPN is trained end-to-end to generate high-quality region proposals, which are used by Fast R-CNN for detection. We further merge RPN and Fast R-CNN into a single network by sharing their convolutional features---using the recently popular terminology of neural networks with 'attention' mechanisms, the RPN component tells the unified network where to look. For the very deep VGG-16 model, our detection system has a frame rate of 5fps (including all steps) on a GPU, while achieving state-of-the-art object detection accuracy on PASCAL VOC 2007, 2012, and MS COCO datasets with only 300 proposals per image. In ILSVRC and COCO 2015 competitions, Faster R-CNN and RPN are the foundations of the 1st-place winning entries in several tracks. 
Code has been made publicly available.</p><div class="paper-list__paper-tldr-holder w-clearfix"><div class="pill pill--gray">TLDR</div><p class="paper-list__paper-abstract">This work introduces a Region Proposal Network (RPN) that shares full-image convolutional features with the detection network, thus enabling nearly cost-free region proposals, and further merges RPN and Fast R-CNN into a single network by sharing their convolutional features.</p></div></a></div><div role="listitem" class="paper-list__paper paper-list__paper--condensed w-dyn-item"><a href="https://www.semanticscholar.org/paper/f8e79ac0ea341056ef20f2616628b3e964764cfd?utm_source=top-arxiv-referred&utm_medium=hubpage&utm_term=28" target="_blank" class="paper-list__paper-link w-inline-block"><h4 class="paper-list__paper-title">You Only Look Once: Unified, Real-Time Object Detection</h4><ul role="list" class="paper-list__paper-meta"><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">Redmon et al.</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">June 8, 2015</p></li></ul><p class="paper-list__paper-abstract w-condition-invisible">We present YOLO, a new approach to object detection. Prior work on object detection repurposes classifiers to perform detection. Instead, we frame object detection as a regression problem to spatially separated bounding boxes and associated class probabilities. A single neural network predicts bounding boxes and class probabilities directly from full images in one evaluation. Since the whole detection pipeline is a single network, it can be optimized end-to-end directly on detection performance. Our unified architecture is extremely fast. Our base YOLO model processes images in real-time at 45 frames per second. A smaller version of the network, Fast YOLO, processes an astounding 155 frames per second while still achieving double the mAP of other real-time detectors. Compared to state-of-the-art detection systems, YOLO makes more localization errors but is less likely to predict false positives on background. Finally, YOLO learns very general representations of objects. 
It outperforms other detection methods, including DPM and R-CNN, when generalizing from natural images to other domains like artwork.</p><div class="paper-list__paper-tldr-holder w-clearfix"><div class="pill pill--gray">TLDR</div><p class="paper-list__paper-abstract">Compared to state-of-the-art detection systems, YOLO makes more localization errors but is less likely to predict false positives on background, and outperforms other detection methods, including DPM and R-CNN, when generalizing from natural images to other domains like artwork.</p></div></a></div><div role="listitem" class="paper-list__paper paper-list__paper--condensed w-dyn-item"><a href="https://www.semanticscholar.org/paper/c8b25fab5608c3e033d34b4483ec47e68ba109b7?utm_source=top-arxiv-referred&utm_medium=hubpage&utm_term=29" target="_blank" class="paper-list__paper-link w-inline-block"><h4 class="paper-list__paper-title">Swin Transformer: Hierarchical Vision Transformer using Shifted Windows</h4><ul role="list" class="paper-list__paper-meta"><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">Liu et al.</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">2021 IEEE/CVF International Conference on Computer Vision (ICCV)</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">March 25, 2021</p></li></ul><p class="paper-list__paper-abstract w-condition-invisible">This paper presents a new vision Transformer, called Swin Transformer, that capably serves as a general-purpose backbone for computer vision. Challenges in adapting Transformer from language to vision arise from differences between the two domains, such as large variations in the scale of visual entities and the high resolution of pixels in images compared to words in text. To address these differences, we propose a hierarchical Transformer whose representation is computed with Shifted windows. The shifted windowing scheme brings greater efficiency by limiting self-attention computation to non-overlapping local windows while also allowing for cross-window connection. This hierarchical architecture has the flexibility to model at various scales and has linear computational complexity with respect to image size. These qualities of Swin Transformer make it compatible with a broad range of vision tasks, including image classification (87.3 top-1 accuracy on ImageNet-1K) and dense prediction tasks such as object detection (58.7 box AP and 51.1 mask AP on COCO test-dev) and semantic segmentation (53.5 mIoU on ADE20K val). Its performance surpasses the previous state-of-the-art by a large margin of +2.7 box AP and +2.6 mask AP on COCO, and +3.2 mIoU on ADE20K, demonstrating the potential of Transformer-based models as vision backbones. The hierarchical design and the shifted window approach also prove beneficial for all-MLP architectures. 
The code and models are publicly available at https://github.com/microsoft/Swin-Transformer.</p><div class="paper-list__paper-tldr-holder w-clearfix"><div class="pill pill--gray">TLDR</div><p class="paper-list__paper-abstract">A hierarchical Transformer whose representation is computed with Shifted windows, which has the flexibility to model at various scales and has linear computational complexity with respect to image size and will prove beneficial for all-MLP architectures.</p></div></a></div><div role="listitem" class="paper-list__paper paper-list__paper--condensed w-dyn-item"><a href="https://www.semanticscholar.org/paper/048ed2240412b684ff88ddf6aac7152d04e5d233?utm_source=top-arxiv-referred&utm_medium=hubpage&utm_term=30" target="_blank" class="paper-list__paper-link w-inline-block"><h4 class="paper-list__paper-title">CPLLM: Clinical Prediction with Large Language Models</h4><ul role="list" class="paper-list__paper-meta"><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">Shoham et al.</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">ArXiv</p></li><li class="paper-list__paper-meta-holder"><p class="paper-list__paper-meta-item">September 20, 2023</p></li></ul><p class="paper-list__paper-abstract">We present Clinical Prediction with Large Language Models (CPLLM), a method that involves fine-tuning a pre-trained Large Language Model (LLM) for clinical disease prediction. We utilized quantization and fine-tuned the LLM using prompts, with the task of predicting whether patients will be diagnosed with a target disease during their next visit or in the subsequent diagnosis, leveraging their historical diagnosis records. We compared our results versus various baselines, including Logistic Regression, RETAIN, and Med-BERT, which is the current state-of-the-art model for disease prediction using structured EHR data. 
Our experiments have shown that CPLLM surpasses all the tested models in terms of both PR-AUC and ROC-AUC metrics, displaying noteworthy enhancements compared to the baseline models.</p><div class="paper-list__paper-tldr-holder w-clearfix"><div class="pill pill--gray w-condition-invisible">TLDR</div><p class="paper-list__paper-abstract w-dyn-bind-empty"></p></div></a></div></div></div></div></div></div></div></main><div class="post__list"><h4 class="post__list-heading">Latest News & Updates</h4><div class="w-dyn-list"><div role="list" class="post__grid w-dyn-items"><div role="listitem" class="post w-dyn-item"><a href="https://blog.allenai.org/case-study-iterative-design-for-skimming-support-5563dbe0899e" target="_blank" class="post__link w-inline-block"><div class="post__image-wrapper"><img src="https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/652055861194e6fc6bb5983a_skimming_2.0.png" loading="lazy" alt="Case Study: Iterative Design for Skimming Support" sizes="(max-width: 479px) 85vw, (max-width: 767px) 84vw, (max-width: 991px) 87vw, 24vw" srcset="https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/652055861194e6fc6bb5983a_skimming_2.0-p-500.png 500w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/652055861194e6fc6bb5983a_skimming_2.0-p-800.png 800w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/652055861194e6fc6bb5983a_skimming_2.0-p-1080.png 1080w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/652055861194e6fc6bb5983a_skimming_2.0-p-1600.png 1600w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/652055861194e6fc6bb5983a_skimming_2.0-p-2000.png 2000w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/652055861194e6fc6bb5983a_skimming_2.0-p-2600.png 2600w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/652055861194e6fc6bb5983a_skimming_2.0-p-3200.png 3200w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/652055861194e6fc6bb5983a_skimming_2.0.png 5834w" class="post__image"/></div><h4 class="post__title">Case Study: Iterative Design for Skimming Support</h4><div class="post__meta"><div class="post__date">Oct 6, 2023</div><div class="post__read-time">7 min read</div></div><p class="post__intro">How might we help researchers quickly assess the relevance of scientific literature? 
Take a closer look at Skimming, Semantic Reader’s latest AI feature, and the collaborative design process behind it.</p></a><div class="post__author">Cassidy Trier</div></div><div role="listitem" class="post w-dyn-item"><a href="https://blog.allenai.org/behind-the-scenes-of-semantic-scholars-new-author-influence-design-d7e007ba6a84" target="_blank" class="post__link w-inline-block"><div class="post__image-wrapper"><img src="https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64de863a294e5ef80fee0187_Screen%20Shot%202023-02-06%20at%2011.36.31%20AM.png" loading="lazy" alt="Behind the Scenes of Semantic Scholar’s New Author Influence Design" sizes="(max-width: 479px) 85vw, (max-width: 767px) 84vw, (max-width: 991px) 87vw, 24vw" srcset="https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64de863a294e5ef80fee0187_Screen%20Shot%202023-02-06%20at%2011.36.31%20AM-p-500.png 500w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64de863a294e5ef80fee0187_Screen%20Shot%202023-02-06%20at%2011.36.31%20AM-p-800.png 800w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64de863a294e5ef80fee0187_Screen%20Shot%202023-02-06%20at%2011.36.31%20AM-p-1080.png 1080w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64de863a294e5ef80fee0187_Screen%20Shot%202023-02-06%20at%2011.36.31%20AM-p-1600.png 1600w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64de863a294e5ef80fee0187_Screen%20Shot%202023-02-06%20at%2011.36.31%20AM-p-2000.png 2000w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64de863a294e5ef80fee0187_Screen%20Shot%202023-02-06%20at%2011.36.31%20AM.png 2150w" class="post__image"/></div><h4 class="post__title">Behind the Scenes of Semantic Scholar’s New Author Influence Design</h4><div class="post__meta"><div class="post__date">Aug 17, 2023</div><div class="post__read-time">5 min read</div></div><p class="post__intro">We released a new version of Author Influence interface to help scholars better discover other scholars in their fields. 
Here's how we identified user insights and made those design choices.</p></a><div class="post__author">Cassidy Trier, Evie Cheng, Ashley Lee</div></div><div role="listitem" class="post w-dyn-item"><a href="https://www.nature.com/articles/d41586-023-01907-z" target="_blank" class="post__link w-inline-block"><div class="post__image-wrapper"><img src="https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64d15e16ad0f9fd89058273b_nature.webp" loading="lazy" alt="Artificial-intelligence search engines wrangle academic literature" sizes="(max-width: 479px) 85vw, (max-width: 767px) 84vw, (max-width: 991px) 87vw, 24vw" srcset="https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64d15e16ad0f9fd89058273b_nature-p-500.webp 500w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64d15e16ad0f9fd89058273b_nature-p-800.webp 800w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64d15e16ad0f9fd89058273b_nature-p-1080.webp 1080w, https://cdn.prod.website-files.com/605ba9b55a4a92803e45a32b/64d15e16ad0f9fd89058273b_nature.webp 1248w" class="post__image"/></div><h4 class="post__title">Artificial-intelligence search engines wrangle academic literature</h4><div class="post__meta"><div class="post__date">Aug 7, 2023</div><div class="post__read-time">5 min read</div></div><p class="post__intro">Nature had a chat with Dan Weld, Chief Scientist at Semantic Scholar, to discuss how search engines are helping scientists explore and innovate by making it easier to draw connections from a massive collection of scientific literature.</p></a><div class="post__author">Amanda Heidt</div></div></div></div></div><div class="cta__blade"><h4 class="cta__header">Experience a smarter way to search and discover scholarly research.</h4><a href="https://www.semanticscholar.org/sign-in" class="button button--hero w-button">Create Your Account</a></div><div class="newsletter"><div class="newsletter__container"><div class="newsletter-layout"><h5 class="newsletter-title">Stay Connected With Semantic Scholar</h5><div class="newsletter-embed w-embed w-script"><!--[if lte IE 8]> <script charset="utf-8" type="text/javascript" src="//js.hsforms.net/forms/v2-legacy.js"></script> <![endif]--> <script charset="utf-8" type="text/javascript" src="//js.hsforms.net/forms/v2.js"></script> <script> hbspt.forms.create({ region: "na1", portalId: "5910970", formId: "b8dd2b25-f81d-4cba-a9b6-044e249b7a07" }); </script></div></div></div></div><footer class="site-footer"><div class="site-footer__top"><div class="site-footer__top-container"><div class="site-footer__about"><h6 class="site-footer site-footer__title">What Is Semantic Scholar?</h6><p class="site-footer site-footer__text">Semantic Scholar is a free, AI-powered research tool for scientific literature, based at Ai2.</p><a href="/about" class="site-footer site-footer__link">Learn More</a></div><div class="site-footer__navigation"><ul role="list" class="site-footer site-footer__list w-list-unstyled"><li><h6 class="site-footer site-footer__title">About</h6></li><li><a href="/about" class="site-footer site-footer__link">About Us<br/></a></li><li><a href="/about/team" class="site-footer site-footer__link">Meet the Team<br/></a></li><li><a href="/about/publishers" class="site-footer site-footer__link">Publishers</a></li><li><a href="https://medium.com/ai2-blog/semantic-scholar/home" target="_blank" class="site-footer site-footer__link">Blog</a></li><li><a href="https://allenai.org/careers?team=semantic+scholar#current-openings" target="_blank" class="site-footer 
site-footer__link">Ai2 Careers</a></li></ul><ul role="list" class="site-footer site-footer__list w-list-unstyled"><li><h6 class="site-footer site-footer__title">Product</h6></li><li><a href="/product" class="site-footer site-footer__link">Product Overview</a></li><li><a href="/product/semantic-reader" class="site-footer site-footer__link">Semantic Reader</a></li><li><a href="/product/scholars-hub" class="site-footer site-footer__link">Scholar's Hub</a></li><li><a href="/product/beta-program" class="site-footer site-footer__link">Beta Program</a></li><li><a href="/product/release-notes" class="site-footer site-footer__link">Release Notes</a></li></ul><ul role="list" class="site-footer site-footer__list w-list-unstyled"><li><h6 class="site-footer site-footer__title">API</h6></li><li><a href="/product/api" class="site-footer site-footer__link">API Overview</a></li><li><a href="/product/api/tutorial" class="site-footer site-footer__link">API Tutorials</a></li><li><a href="https://api.semanticscholar.org/api-docs/" class="site-footer site-footer__link">API Documentation</a></li><li><a href="/product/api/gallery" class="site-footer site-footer__link">API Gallery</a></li></ul><ul id="w-node-_80db44ed-17f7-2024-a450-ff6046e68512-46e684e3" role="list" class="site-footer site-footer__list w-list-unstyled"><li><h6 class="site-footer site-footer__title">Research</h6></li><li><a href="https://allenai.org/papers?tag=Semantic%20Scholar" class="site-footer site-footer__link">Publications</a></li><li><a href="/research/research-team" class="site-footer site-footer__link">Researchers</a></li><li><a href="/research/careers" class="site-footer site-footer__link">Research Careers</a></li><li><a href="/research/prototypes" class="site-footer site-footer__link">Prototypes</a></li><li><a href="/resources" class="site-footer site-footer__link">Resources</a></li></ul><ul id="w-node-a1cfe8f5-f656-0f8f-b57f-f2c91de1b718-46e684e3" role="list" class="site-footer site-footer__list w-list-unstyled"><li><h6 class="site-footer site-footer__title">Help</h6></li><li><a href="https://www.semanticscholar.org/faq" class="site-footer site-footer__link">FAQ</a></li><li><a href="/about/librarians" class="site-footer site-footer__link">Librarians</a></li><li><a href="/product/tutorials" class="site-footer site-footer__link">Tutorials</a></li><li><a href="#" data-w-id="2cf6e605-c551-b5e7-40a2-70dbdd9705a1" class="site-footer site-footer__link site-footer__contact-trigger">Contact</a></li></ul></div></div></div><div class="site-footer__bottom"><div class="site-footer__bottom-container"><p class="site-footer__legal">Proudly built by <a href="https://allenai.org/" target="_blank" class="site-footer site-footer__link">Ai2</a> with the help of our Collaborators<br/><a href="https://allenai.org/terms.html" target="_blank" class="site-footer site-footer__link">Terms of Service</a> • <a href="https://allenai.org/privacy-policy.html" target="_blank" class="site-footer site-footer__link">Privacy Policy</a> • <a href="/product/api/license" class="site-footer site-footer__link">API License Agreement</a></p><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66a167e36965869758dd87f8_Ai2_logo_offwhite_RGB.png" loading="lazy" sizes="94.5703125px" srcset="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66a167e36965869758dd87f8_Ai2_logo_offwhite_RGB-p-500.png 500w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66a167e36965869758dd87f8_Ai2_logo_offwhite_RGB-p-800.png 800w, 
https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66a167e36965869758dd87f8_Ai2_logo_offwhite_RGB-p-1080.png 1080w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66a167e36965869758dd87f8_Ai2_logo_offwhite_RGB-p-1600.png 1600w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66a167e36965869758dd87f8_Ai2_logo_offwhite_RGB-p-2000.png 2000w, https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/66a167e36965869758dd87f8_Ai2_logo_offwhite_RGB.png 2771w" alt="" class="site-footer__logo"/></div></div><div class="contact-modal"><div class="contact-modal__container"><a data-w-id="094e8a79-f899-529e-250c-5240927de9d7" href="#" class="contact-modal__close w-inline-block"><img src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/61e9b4e329b9d877dee723c3_close-light.svg" loading="lazy" alt="Close" class="contact-modal__close-jewel"/></a><h4 class="margin-top--none">Contact Us</h4><div class="contact-modal__form-wrapper w-form"><form id="freshdesk-contact-form" name="wf-form-Contact" data-name="Contact" action="https://www.semanticscholar.org/api/1/feedback" method="post" class="contact-modal__form" data-wf-page-id="6584745360a4872a287a89a3" data-wf-element-id="7663d2bf-5ed9-4856-37e4-a4966bfbf84f"><p class="margin-bottom--sm">Please visit our <a href="https://www.semanticscholar.org/faq">FAQ</a> to find helpful information before submitting your question.<br/></p><label for="contact-form-name">Your name</label><input class="w-input" maxlength="256" name="name" data-name="name" placeholder="" type="text" id="contact-form-name"/><label for="contact-form-email-2">Your email</label><input class="w-input" maxlength="256" name="email" data-name="email" placeholder="" type="email" id="contact-form-email" required=""/><label for="contact-form-subject">Subject<br/></label><input class="w-input" maxlength="256" name="subject" data-name="subject" placeholder="" type="text" id="contact-form-subject" required=""/><label for="contact-form-topic">Topic<br/></label><select id="contact-form-topic" name="topic" data-name="topic" required="" class="select-field w-select"><option value="">Select A Topic</option><option value="Takedown Request">Remove A Paper</option><option value="Author Disambiguation">Merge Authors</option><option value="Other Problem">Other</option></select><label for="contact-form-feedback-2">Feedback<br/></label><textarea id="contact-form-feedback" name="feedback" maxlength="5000" data-name="feedback" placeholder="" required="" class="margin-bottom--sm w-input"></textarea><input type="submit" data-wait="Please wait..." class="button w-button" value="Contact Us"/></form><div class="contact-modal__form-success w-form-done"><div><strong>Thanks! 
</strong>Your feedback has been submitted.</div></div><div class="contact-modal__form-error w-form-fail"><div>Something went wrong while submitting the form, please try again.</div></div></div></div><div data-w-id="094e8a79-f899-529e-250c-5240927de9fe" class="contact-modal__overlay"></div></div></footer><script src="https://d3e54v103j8qbb.cloudfront.net/js/jquery-3.5.1.min.dc5e7f18c8.js?site=605236bb767e9a5bb229c63c" type="text/javascript" integrity="sha256-9/aliU8dGd2tb6OSsuzixeV4y/faTqgFtohetphbbj0=" crossorigin="anonymous"></script><script src="https://cdn.prod.website-files.com/605236bb767e9a5bb229c63c/js/semanticscholar.15d8b03cf.js" type="text/javascript"></script><script> $(document).ready(function() { // Contact Form $('.contact-modal__form').submit(function(e){ // Stops regular form submit e.preventDefault(); // Sets variables, encodes form into json var $this = $(this), $parent = $this.parent(), $success = $parent.find(".contact-modal__form-success"), $error = $parent.find(".contact-modal__form-error"), action = $this.attr('action'), submission = $this.serializeArray().reduce((memo, field) => ({...memo, [field.name]: field.value}), {}); // Record URL submission.url=window.location.href; // Submit $.ajax(action, { method: 'POST', contentType: 'application/json', data: JSON.stringify(submission), cache: false, dataType: 'json', crossDomain: true, processData: false }).always(function(e){ // Hides form, shows success $this.hide(); $success.show(); }); // just in case return false; }); // Listens for links to /about/contact and pops up contact form instead of redirecting. $('.main a[href$="about/contact"]').on('click', function(e){ e.preventDefault(); $('.contact-modal').show(); }); }); </script></body></html>