CINXE.COM

Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;17 of 17 results for author: <span class="mathjax">Bello, I</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Bello%2C+I">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Bello, I"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Bello%2C+I&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Bello, I"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.07603">arXiv:2407.07603</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.07603">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> iiANET: Inception Inspired Attention Hybrid Network for efficient Long-Range Dependency </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yunusa%2C+H">Haruna Yunusa</a>, <a href="/search/cs?searchtype=author&amp;query=Shiyin%2C+Q">Qin Shiyin</a>, <a href="/search/cs?searchtype=author&amp;query=Chukkol%2C+A+H+A">Abdulrahman Hamman Adama Chukkol</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+I">Isah Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Lawan%2C+A">Adamu Lawan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.07603v1-abstract-short" style="display: inline;"> The recent emergence of hybrid models has introduced another transformative approach to solving computer vision tasks, slowly shifting away from conventional CNN (Convolutional Neural Network) and ViT (Vision Transformer). However, not enough effort has been made to efficiently combine these two approaches to improve capturing long-range dependencies prevalent in complex images. In this paper, we&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07603v1-abstract-full').style.display = 'inline'; document.getElementById('2407.07603v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.07603v1-abstract-full" style="display: none;"> The recent emergence of hybrid models has introduced another transformative approach to solving computer vision tasks, slowly shifting away from conventional CNN (Convolutional Neural Network) and ViT (Vision Transformer). However, not enough effort has been made to efficiently combine these two approaches to improve capturing long-range dependencies prevalent in complex images. In this paper, we introduce iiANET (Inception Inspired Attention Network), an efficient hybrid model designed to capture long-range dependencies in complex images. The fundamental building block, iiABlock, integrates global 2D-MHSA (Multi-Head Self-Attention) with Registers, MBConv2 (MobileNetV2-based convolution), and dilated convolution in parallel, enabling the model to adeptly leverage self-attention for capturing long-range dependencies while utilizing MBConv2 for effective local-detail extraction and dilated convolution for efficiently expanding the kernel receptive field to capture more contextual information. Lastly, we serially integrate an ECANET (Efficient Channel Attention Network) at the end of each iiABlock to calibrate channel-wise attention for enhanced model performance. Extensive qualitative and quantitative comparative evaluation on various benchmarks demonstrates improved performance over some state-of-the-art models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.07603v1-abstract-full').style.display = 'none'; document.getElementById('2407.07603v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.16291">arXiv:2402.16291</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.16291">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SaRPFF: A Self-Attention with Register-based Pyramid Feature Fusion module for enhanced RLD detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Haruna%2C+Y">Yunusa Haruna</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+S">Shiyin Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Chukkol%2C+A+H+A">Abdulrahman Hamman Adama Chukkol</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+I">Isah Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Lawan%2C+A">Adamu Lawan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.16291v2-abstract-short" style="display: inline;"> Detecting objects across varying scales is still a challenge in computer vision, particularly in agricultural applications like Rice Leaf Disease (RLD) detection, where objects exhibit significant scale variations (SV). Conventional object detection (OD) like Faster R-CNN, SSD, and YOLO methods often fail to effectively address SV, leading to reduced accuracy and missed detections. To tackle this,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.16291v2-abstract-full').style.display = 'inline'; document.getElementById('2402.16291v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.16291v2-abstract-full" style="display: none;"> Detecting objects across varying scales is still a challenge in computer vision, particularly in agricultural applications like Rice Leaf Disease (RLD) detection, where objects exhibit significant scale variations (SV). Conventional object detection (OD) like Faster R-CNN, SSD, and YOLO methods often fail to effectively address SV, leading to reduced accuracy and missed detections. To tackle this, we propose SaRPFF (Self-Attention with Register-based Pyramid Feature Fusion), a novel module designed to enhance multi-scale object detection. SaRPFF integrates 2D-Multi-Head Self-Attention (MHSA) with Register tokens, improving feature interpretability by mitigating artifacts within MHSA. Additionally, it integrates efficient attention atrous convolutions into the pyramid feature fusion and introduce a deconvolutional layer for refined up-sampling. We evaluate SaRPFF on YOLOv7 using the MRLD and COCO datasets. Our approach demonstrates a +2.61% improvement in Average Precision (AP) on the MRLD dataset compared to the baseline FPN method in YOLOv7. Furthermore, SaRPFF outperforms other FPN variants, including BiFPN, NAS-FPN, and PANET, showcasing its versatility and potential to advance OD techniques. This study highlights SaRPFF effectiveness in addressing SV challenges and its adaptability across FPN-based OD models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.16291v2-abstract-full').style.display = 'none'; document.getElementById('2402.16291v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.09939">arXiv:2402.09939</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.09939">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Generative AI in the Construction Industry: A State-of-the-art Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Taiwo%2C+R">Ridwan Taiwo</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+I+T">Idris Temitope Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Abdulai%2C+S+F">Sulemana Fatoama Abdulai</a>, <a href="/search/cs?searchtype=author&amp;query=Yussif%2C+A">Abdul-Mugis Yussif</a>, <a href="/search/cs?searchtype=author&amp;query=Salami%2C+B+A">Babatunde Abiodun Salami</a>, <a href="/search/cs?searchtype=author&amp;query=Saka%2C+A">Abdullahi Saka</a>, <a href="/search/cs?searchtype=author&amp;query=Zayed%2C+T">Tarek Zayed</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.09939v1-abstract-short" style="display: inline;"> The construction industry is a vital sector of the global economy, but it faces many productivity challenges in various processes, such as design, planning, procurement, inspection, and maintenance. Generative artificial intelligence (AI), which can create novel and realistic data or content, such as text, image, video, or code, based on some input or prior knowledge, offers innovative and disrupt&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.09939v1-abstract-full').style.display = 'inline'; document.getElementById('2402.09939v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.09939v1-abstract-full" style="display: none;"> The construction industry is a vital sector of the global economy, but it faces many productivity challenges in various processes, such as design, planning, procurement, inspection, and maintenance. Generative artificial intelligence (AI), which can create novel and realistic data or content, such as text, image, video, or code, based on some input or prior knowledge, offers innovative and disruptive solutions to address these challenges. However, there is a gap in the literature on the current state, opportunities, and challenges of generative AI in the construction industry. This study aims to fill this gap by providing a state-of-the-art analysis of generative AI in construction, with three objectives: (1) to review and categorize the existing and emerging generative AI opportunities and challenges in the construction industry; (2) to propose a framework for construction firms to build customized generative AI solutions using their own data, comprising steps such as data collection, dataset curation, training custom large language model (LLM), model evaluation, and deployment; and (3) to demonstrate the framework via a case study of developing a generative model for querying contract documents. The results show that retrieval augmented generation (RAG) improves the baseline LLM by 5.2, 9.4, and 4.8% in terms of quality, relevance, and reproducibility. This study provides academics and construction professionals with a comprehensive analysis and practical framework to guide the adoption of generative AI techniques to enhance productivity, quality, safety, and sustainability across the construction industry. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.09939v1-abstract-full').style.display = 'none'; document.getElementById('2402.09939v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">74 pages, 11 figures, 20 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.02941">arXiv:2402.02941</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.02941">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Exploring the Synergies of Hybrid CNNs and ViTs Architectures for Computer Vision: A survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yunusa%2C+H">Haruna Yunusa</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+S">Shiyin Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Chukkol%2C+A+H+A">Abdulrahman Hamman Adama Chukkol</a>, <a href="/search/cs?searchtype=author&amp;query=Yusuf%2C+A+A">Abdulganiyu Abdu Yusuf</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+I">Isah Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Lawan%2C+A">Adamu Lawan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.02941v1-abstract-short" style="display: inline;"> The hybrid of Convolutional Neural Network (CNN) and Vision Transformers (ViT) architectures has emerged as a groundbreaking approach, pushing the boundaries of computer vision (CV). This comprehensive review provides a thorough examination of the literature on state-of-the-art hybrid CNN-ViT architectures, exploring the synergies between these two approaches. The main content of this survey inclu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.02941v1-abstract-full').style.display = 'inline'; document.getElementById('2402.02941v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.02941v1-abstract-full" style="display: none;"> The hybrid of Convolutional Neural Network (CNN) and Vision Transformers (ViT) architectures has emerged as a groundbreaking approach, pushing the boundaries of computer vision (CV). This comprehensive review provides a thorough examination of the literature on state-of-the-art hybrid CNN-ViT architectures, exploring the synergies between these two approaches. The main content of this survey includes: (1) a background on the vanilla CNN and ViT, (2) systematic review of various taxonomic hybrid designs to explore the synergy achieved through merging CNNs and ViTs models, (3) comparative analysis and application task-specific synergy between different hybrid architectures, (4) challenges and future directions for hybrid models, (5) lastly, the survey concludes with a summary of key findings and recommendations. Through this exploration of hybrid CV architectures, the survey aims to serve as a guiding resource, fostering a deeper understanding of the intricate dynamics between CNNs and ViTs and their collective impact on shaping the future of CV architectures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.02941v1-abstract-full').style.display = 'none'; document.getElementById('2402.02941v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.08774">arXiv:2303.08774</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.08774">pdf</a>, <a href="https://arxiv.org/format/2303.08774">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GPT-4 Technical Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=OpenAI"> OpenAI</a>, <a href="/search/cs?searchtype=author&amp;query=Achiam%2C+J">Josh Achiam</a>, <a href="/search/cs?searchtype=author&amp;query=Adler%2C+S">Steven Adler</a>, <a href="/search/cs?searchtype=author&amp;query=Agarwal%2C+S">Sandhini Agarwal</a>, <a href="/search/cs?searchtype=author&amp;query=Ahmad%2C+L">Lama Ahmad</a>, <a href="/search/cs?searchtype=author&amp;query=Akkaya%2C+I">Ilge Akkaya</a>, <a href="/search/cs?searchtype=author&amp;query=Aleman%2C+F+L">Florencia Leoni Aleman</a>, <a href="/search/cs?searchtype=author&amp;query=Almeida%2C+D">Diogo Almeida</a>, <a href="/search/cs?searchtype=author&amp;query=Altenschmidt%2C+J">Janko Altenschmidt</a>, <a href="/search/cs?searchtype=author&amp;query=Altman%2C+S">Sam Altman</a>, <a href="/search/cs?searchtype=author&amp;query=Anadkat%2C+S">Shyamal Anadkat</a>, <a href="/search/cs?searchtype=author&amp;query=Avila%2C+R">Red Avila</a>, <a href="/search/cs?searchtype=author&amp;query=Babuschkin%2C+I">Igor Babuschkin</a>, <a href="/search/cs?searchtype=author&amp;query=Balaji%2C+S">Suchir Balaji</a>, <a href="/search/cs?searchtype=author&amp;query=Balcom%2C+V">Valerie Balcom</a>, <a href="/search/cs?searchtype=author&amp;query=Baltescu%2C+P">Paul Baltescu</a>, <a href="/search/cs?searchtype=author&amp;query=Bao%2C+H">Haiming Bao</a>, <a href="/search/cs?searchtype=author&amp;query=Bavarian%2C+M">Mohammad Bavarian</a>, <a href="/search/cs?searchtype=author&amp;query=Belgum%2C+J">Jeff Belgum</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+I">Irwan Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Berdine%2C+J">Jake Berdine</a>, <a href="/search/cs?searchtype=author&amp;query=Bernadett-Shapiro%2C+G">Gabriel Bernadett-Shapiro</a>, <a href="/search/cs?searchtype=author&amp;query=Berner%2C+C">Christopher Berner</a>, <a href="/search/cs?searchtype=author&amp;query=Bogdonoff%2C+L">Lenny Bogdonoff</a>, <a href="/search/cs?searchtype=author&amp;query=Boiko%2C+O">Oleg Boiko</a> , et al. (256 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.08774v6-abstract-short" style="display: inline;"> We report the development of GPT-4, a large-scale, multimodal model which can accept image and text inputs and produce text outputs. While less capable than humans in many real-world scenarios, GPT-4 exhibits human-level performance on various professional and academic benchmarks, including passing a simulated bar exam with a score around the top 10% of test takers. GPT-4 is a Transformer-based mo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.08774v6-abstract-full').style.display = 'inline'; document.getElementById('2303.08774v6-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.08774v6-abstract-full" style="display: none;"> We report the development of GPT-4, a large-scale, multimodal model which can accept image and text inputs and produce text outputs. While less capable than humans in many real-world scenarios, GPT-4 exhibits human-level performance on various professional and academic benchmarks, including passing a simulated bar exam with a score around the top 10% of test takers. GPT-4 is a Transformer-based model pre-trained to predict the next token in a document. The post-training alignment process results in improved performance on measures of factuality and adherence to desired behavior. A core component of this project was developing infrastructure and optimization methods that behave predictably across a wide range of scales. This allowed us to accurately predict some aspects of GPT-4&#39;s performance based on models trained with no more than 1/1,000th the compute of GPT-4. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.08774v6-abstract-full').style.display = 'none'; document.getElementById('2303.08774v6-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">100 pages; updated authors list; fixed author names and added citation</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2211.05100">arXiv:2211.05100</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2211.05100">pdf</a>, <a href="https://arxiv.org/format/2211.05100">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> BLOOM: A 176B-Parameter Open-Access Multilingual Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Workshop%2C+B">BigScience Workshop</a>, <a href="/search/cs?searchtype=author&amp;query=%3A"> :</a>, <a href="/search/cs?searchtype=author&amp;query=Scao%2C+T+L">Teven Le Scao</a>, <a href="/search/cs?searchtype=author&amp;query=Fan%2C+A">Angela Fan</a>, <a href="/search/cs?searchtype=author&amp;query=Akiki%2C+C">Christopher Akiki</a>, <a href="/search/cs?searchtype=author&amp;query=Pavlick%2C+E">Ellie Pavlick</a>, <a href="/search/cs?searchtype=author&amp;query=Ili%C4%87%2C+S">Suzana Ili膰</a>, <a href="/search/cs?searchtype=author&amp;query=Hesslow%2C+D">Daniel Hesslow</a>, <a href="/search/cs?searchtype=author&amp;query=Castagn%C3%A9%2C+R">Roman Castagn茅</a>, <a href="/search/cs?searchtype=author&amp;query=Luccioni%2C+A+S">Alexandra Sasha Luccioni</a>, <a href="/search/cs?searchtype=author&amp;query=Yvon%2C+F">Fran莽ois Yvon</a>, <a href="/search/cs?searchtype=author&amp;query=Gall%C3%A9%2C+M">Matthias Gall茅</a>, <a href="/search/cs?searchtype=author&amp;query=Tow%2C+J">Jonathan Tow</a>, <a href="/search/cs?searchtype=author&amp;query=Rush%2C+A+M">Alexander M. Rush</a>, <a href="/search/cs?searchtype=author&amp;query=Biderman%2C+S">Stella Biderman</a>, <a href="/search/cs?searchtype=author&amp;query=Webson%2C+A">Albert Webson</a>, <a href="/search/cs?searchtype=author&amp;query=Ammanamanchi%2C+P+S">Pawan Sasanka Ammanamanchi</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+T">Thomas Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Sagot%2C+B">Beno卯t Sagot</a>, <a href="/search/cs?searchtype=author&amp;query=Muennighoff%2C+N">Niklas Muennighoff</a>, <a href="/search/cs?searchtype=author&amp;query=del+Moral%2C+A+V">Albert Villanova del Moral</a>, <a href="/search/cs?searchtype=author&amp;query=Ruwase%2C+O">Olatunji Ruwase</a>, <a href="/search/cs?searchtype=author&amp;query=Bawden%2C+R">Rachel Bawden</a>, <a href="/search/cs?searchtype=author&amp;query=Bekman%2C+S">Stas Bekman</a>, <a href="/search/cs?searchtype=author&amp;query=McMillan-Major%2C+A">Angelina McMillan-Major</a> , et al. (369 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2211.05100v4-abstract-short" style="display: inline;"> Large language models (LLMs) have been shown to be able to perform new tasks based on a few demonstrations or natural language instructions. While these capabilities have led to widespread adoption, most LLMs are developed by resource-rich organizations and are frequently kept from the public. As a step towards democratizing this powerful technology, we present BLOOM, a 176B-parameter open-access&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.05100v4-abstract-full').style.display = 'inline'; document.getElementById('2211.05100v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2211.05100v4-abstract-full" style="display: none;"> Large language models (LLMs) have been shown to be able to perform new tasks based on a few demonstrations or natural language instructions. While these capabilities have led to widespread adoption, most LLMs are developed by resource-rich organizations and are frequently kept from the public. As a step towards democratizing this powerful technology, we present BLOOM, a 176B-parameter open-access language model designed and built thanks to a collaboration of hundreds of researchers. BLOOM is a decoder-only Transformer language model that was trained on the ROOTS corpus, a dataset comprising hundreds of sources in 46 natural and 13 programming languages (59 in total). We find that BLOOM achieves competitive performance on a wide variety of benchmarks, with stronger results after undergoing multitask prompted finetuning. To facilitate future research and applications using LLMs, we publicly release our models and code under the Responsible AI License. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2211.05100v4-abstract-full').style.display = 'none'; document.getElementById('2211.05100v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 November, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.08906">arXiv:2202.08906</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2202.08906">pdf</a>, <a href="https://arxiv.org/format/2202.08906">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ST-MoE: Designing Stable and Transferable Sparse Expert Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zoph%2C+B">Barret Zoph</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+I">Irwan Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Kumar%2C+S">Sameer Kumar</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+N">Nan Du</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yanping Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Dean%2C+J">Jeff Dean</a>, <a href="/search/cs?searchtype=author&amp;query=Shazeer%2C+N">Noam Shazeer</a>, <a href="/search/cs?searchtype=author&amp;query=Fedus%2C+W">William Fedus</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.08906v2-abstract-short" style="display: inline;"> Scale has opened new frontiers in natural language processing -- but at a high cost. In response, Mixture-of-Experts (MoE) and Switch Transformers have been proposed as an energy efficient path to even larger and more capable language models. But advancing the state-of-the-art across a broad set of natural language tasks has been hindered by training instabilities and uncertain quality during fine&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.08906v2-abstract-full').style.display = 'inline'; document.getElementById('2202.08906v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.08906v2-abstract-full" style="display: none;"> Scale has opened new frontiers in natural language processing -- but at a high cost. In response, Mixture-of-Experts (MoE) and Switch Transformers have been proposed as an energy efficient path to even larger and more capable language models. But advancing the state-of-the-art across a broad set of natural language tasks has been hindered by training instabilities and uncertain quality during fine-tuning. Our work focuses on these issues and acts as a design guide. We conclude by scaling a sparse model to 269B parameters, with a computational cost comparable to a 32B dense encoder-decoder Transformer (Stable and Transferable Mixture-of-Experts or ST-MoE-32B). For the first time, a sparse model achieves state-of-the-art performance in transfer learning, across a diverse set of tasks including reasoning (SuperGLUE, ARC Easy, ARC Challenge), summarization (XSum, CNN-DM), closed book question answering (WebQA, Natural Questions), and adversarially constructed tasks (Winogrande, ANLI R3). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.08906v2-abstract-full').style.display = 'none'; document.getElementById('2202.08906v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages main text, 39 pages overall</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.01696">arXiv:2109.01696</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2109.01696">pdf</a>, <a href="https://arxiv.org/format/2109.01696">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Revisiting 3D ResNets for Video Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Du%2C+X">Xianzhi Du</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yeqing Li</a>, <a href="/search/cs?searchtype=author&amp;query=Cui%2C+Y">Yin Cui</a>, <a href="/search/cs?searchtype=author&amp;query=Qian%2C+R">Rui Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jing Li</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+I">Irwan Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.01696v1-abstract-short" style="display: inline;"> A recent work from Bello shows that training and scaling strategies may be more significant than model architectures for visual recognition. This short note studies effective training and scaling strategies for video recognition models. We propose a simple scaling strategy for 3D ResNets, in combination with improved training strategies and minor architectural changes. The resulting models, termed&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.01696v1-abstract-full').style.display = 'inline'; document.getElementById('2109.01696v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.01696v1-abstract-full" style="display: none;"> A recent work from Bello shows that training and scaling strategies may be more significant than model architectures for visual recognition. This short note studies effective training and scaling strategies for video recognition models. We propose a simple scaling strategy for 3D ResNets, in combination with improved training strategies and minor architectural changes. The resulting models, termed 3D ResNet-RS, attain competitive performance of 81.0 on Kinetics-400 and 83.8 on Kinetics-600 without pre-training. When pre-trained on a large Web Video Text dataset, our best model achieves 83.5 and 84.3 on Kinetics-400 and Kinetics-600. The proposed scaling rule is further evaluated in a self-supervised setup using contrastive learning, demonstrating improved performance. Code is available at: https://github.com/tensorflow/models/tree/master/official. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.01696v1-abstract-full').style.display = 'none'; document.getElementById('2109.01696v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2103.07579">arXiv:2103.07579</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2103.07579">pdf</a>, <a href="https://arxiv.org/format/2103.07579">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Revisiting ResNets: Improved Training and Scaling Strategies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bello%2C+I">Irwan Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Fedus%2C+W">William Fedus</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+X">Xianzhi Du</a>, <a href="/search/cs?searchtype=author&amp;query=Cubuk%2C+E+D">Ekin D. Cubuk</a>, <a href="/search/cs?searchtype=author&amp;query=Srinivas%2C+A">Aravind Srinivas</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+T">Tsung-Yi Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Shlens%2C+J">Jonathon Shlens</a>, <a href="/search/cs?searchtype=author&amp;query=Zoph%2C+B">Barret Zoph</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2103.07579v1-abstract-short" style="display: inline;"> Novel computer vision architectures monopolize the spotlight, but the impact of the model architecture is often conflated with simultaneous changes to training methodology and scaling strategies. Our work revisits the canonical ResNet (He et al., 2015) and studies these three aspects in an effort to disentangle them. Perhaps surprisingly, we find that training and scaling strategies may matter mor&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.07579v1-abstract-full').style.display = 'inline'; document.getElementById('2103.07579v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2103.07579v1-abstract-full" style="display: none;"> Novel computer vision architectures monopolize the spotlight, but the impact of the model architecture is often conflated with simultaneous changes to training methodology and scaling strategies. Our work revisits the canonical ResNet (He et al., 2015) and studies these three aspects in an effort to disentangle them. Perhaps surprisingly, we find that training and scaling strategies may matter more than architectural changes, and further, that the resulting ResNets match recent state-of-the-art models. We show that the best performing scaling strategy depends on the training regime and offer two new scaling strategies: (1) scale model depth in regimes where overfitting can occur (width scaling is preferable otherwise); (2) increase image resolution more slowly than previously recommended (Tan &amp; Le, 2019). Using improved training and scaling strategies, we design a family of ResNet architectures, ResNet-RS, which are 1.7x - 2.7x faster than EfficientNets on TPUs, while achieving similar accuracies on ImageNet. In a large-scale semi-supervised learning setup, ResNet-RS achieves 86.2% top-1 ImageNet accuracy, while being 4.7x faster than EfficientNet NoisyStudent. The training techniques improve transfer performance on a suite of downstream tasks (rivaling state-of-the-art self-supervised algorithms) and extend to video classification on Kinetics-400. We recommend practitioners use these simple revised ResNets as baselines for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.07579v1-abstract-full').style.display = 'none'; document.getElementById('2103.07579v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2102.08602">arXiv:2102.08602</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2102.08602">pdf</a>, <a href="https://arxiv.org/format/2102.08602">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LambdaNetworks: Modeling Long-Range Interactions Without Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bello%2C+I">Irwan Bello</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2102.08602v1-abstract-short" style="display: inline;"> We present lambda layers -- an alternative framework to self-attention -- for capturing long-range interactions between an input and structured contextual information (e.g. a pixel surrounded by other pixels). Lambda layers capture such interactions by transforming available contexts into linear functions, termed lambdas, and applying these linear functions to each input separately. Similar to lin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.08602v1-abstract-full').style.display = 'inline'; document.getElementById('2102.08602v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2102.08602v1-abstract-full" style="display: none;"> We present lambda layers -- an alternative framework to self-attention -- for capturing long-range interactions between an input and structured contextual information (e.g. a pixel surrounded by other pixels). Lambda layers capture such interactions by transforming available contexts into linear functions, termed lambdas, and applying these linear functions to each input separately. Similar to linear attention, lambda layers bypass expensive attention maps, but in contrast, they model both content and position-based interactions which enables their application to large structured inputs such as images. The resulting neural network architectures, LambdaNetworks, significantly outperform their convolutional and attentional counterparts on ImageNet classification, COCO object detection and COCO instance segmentation, while being more computationally efficient. Additionally, we design LambdaResNets, a family of hybrid architectures across different scales, that considerably improves the speed-accuracy tradeoff of image classification models. LambdaResNets reach excellent accuracies on ImageNet while being 3.2 - 4.4x faster than the popular EfficientNets on modern machine learning accelerators. When training with an additional 130M pseudo-labeled images, LambdaResNets achieve up to a 9.5x speed-up over the corresponding EfficientNet checkpoints. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2102.08602v1-abstract-full').style.display = 'none'; document.getElementById('2102.08602v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication at the International Conference in Learning Representations 2021 (Spotlight)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2010.03019">arXiv:2010.03019</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2010.03019">pdf</a>, <a href="https://arxiv.org/format/2010.03019">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Global Self-Attention Networks for Image Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shen%2C+Z">Zhuoran Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+I">Irwan Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Vemulapalli%2C+R">Raviteja Vemulapalli</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+X">Xuhui Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Ching-Hui Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2010.03019v2-abstract-short" style="display: inline;"> Recently, a series of works in computer vision have shown promising results on various image and video understanding tasks using self-attention. However, due to the quadratic computational and memory complexities of self-attention, these works either apply attention only to low-resolution feature maps in later stages of a deep network or restrict the receptive field of attention in each layer to a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.03019v2-abstract-full').style.display = 'inline'; document.getElementById('2010.03019v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2010.03019v2-abstract-full" style="display: none;"> Recently, a series of works in computer vision have shown promising results on various image and video understanding tasks using self-attention. However, due to the quadratic computational and memory complexities of self-attention, these works either apply attention only to low-resolution feature maps in later stages of a deep network or restrict the receptive field of attention in each layer to a small local region. To overcome these limitations, this work introduces a new global self-attention module, referred to as the GSA module, which is efficient enough to serve as the backbone component of a deep network. This module consists of two parallel layers: a content attention layer that attends to pixels based only on their content and a positional attention layer that attends to pixels based on their spatial locations. The output of this module is the sum of the outputs of the two layers. Based on the proposed GSA module, we introduce new standalone global attention-based deep networks that use GSA modules instead of convolutions to model pixel interactions. Due to the global extent of the proposed GSA module, a GSA network has the ability to model long-range pixel interactions throughout the network. Our experimental results show that GSA networks outperform the corresponding convolution-based networks significantly on the CIFAR-100 and ImageNet datasets while using less parameters and computations. The proposed GSA networks also outperform various existing attention-based networks on the ImageNet dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.03019v2-abstract-full').style.display = 'none'; document.getElementById('2010.03019v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2020. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1906.05909">arXiv:1906.05909</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1906.05909">pdf</a>, <a href="https://arxiv.org/format/1906.05909">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Stand-Alone Self-Attention in Vision Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ramachandran%2C+P">Prajit Ramachandran</a>, <a href="/search/cs?searchtype=author&amp;query=Parmar%2C+N">Niki Parmar</a>, <a href="/search/cs?searchtype=author&amp;query=Vaswani%2C+A">Ashish Vaswani</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+I">Irwan Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Levskaya%2C+A">Anselm Levskaya</a>, <a href="/search/cs?searchtype=author&amp;query=Shlens%2C+J">Jonathon Shlens</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1906.05909v1-abstract-short" style="display: inline;"> Convolutions are a fundamental building block of modern computer vision systems. Recent approaches have argued for going beyond convolutions in order to capture long-range dependencies. These efforts focus on augmenting convolutional models with content-based interactions, such as self-attention and non-local means, to achieve gains on a number of vision tasks. The natural question that arises is&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.05909v1-abstract-full').style.display = 'inline'; document.getElementById('1906.05909v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1906.05909v1-abstract-full" style="display: none;"> Convolutions are a fundamental building block of modern computer vision systems. Recent approaches have argued for going beyond convolutions in order to capture long-range dependencies. These efforts focus on augmenting convolutional models with content-based interactions, such as self-attention and non-local means, to achieve gains on a number of vision tasks. The natural question that arises is whether attention can be a stand-alone primitive for vision models instead of serving as just an augmentation on top of convolutions. In developing and testing a pure self-attention vision model, we verify that self-attention can indeed be an effective stand-alone layer. A simple procedure of replacing all instances of spatial convolutions with a form of self-attention applied to ResNet model produces a fully self-attentional model that outperforms the baseline on ImageNet classification with 12% fewer FLOPS and 29% fewer parameters. On COCO object detection, a pure self-attention model matches the mAP of a baseline RetinaNet while having 39% fewer FLOPS and 34% fewer parameters. Detailed ablation studies demonstrate that self-attention is especially impactful when used in later layers. These results establish that stand-alone self-attention is an important addition to the vision practitioner&#39;s toolbox. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1906.05909v1-abstract-full').style.display = 'none'; document.getElementById('1906.05909v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1904.09925">arXiv:1904.09925</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1904.09925">pdf</a>, <a href="https://arxiv.org/format/1904.09925">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Attention Augmented Convolutional Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bello%2C+I">Irwan Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Zoph%2C+B">Barret Zoph</a>, <a href="/search/cs?searchtype=author&amp;query=Vaswani%2C+A">Ashish Vaswani</a>, <a href="/search/cs?searchtype=author&amp;query=Shlens%2C+J">Jonathon Shlens</a>, <a href="/search/cs?searchtype=author&amp;query=Le%2C+Q+V">Quoc V. Le</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1904.09925v5-abstract-short" style="display: inline;"> Convolutional networks have been the paradigm of choice in many computer vision applications. The convolution operation however has a significant weakness in that it only operates on a local neighborhood, thus missing global information. Self-attention, on the other hand, has emerged as a recent advance to capture long range interactions, but has mostly been applied to sequence modeling and genera&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.09925v5-abstract-full').style.display = 'inline'; document.getElementById('1904.09925v5-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1904.09925v5-abstract-full" style="display: none;"> Convolutional networks have been the paradigm of choice in many computer vision applications. The convolution operation however has a significant weakness in that it only operates on a local neighborhood, thus missing global information. Self-attention, on the other hand, has emerged as a recent advance to capture long range interactions, but has mostly been applied to sequence modeling and generative modeling tasks. In this paper, we consider the use of self-attention for discriminative visual tasks as an alternative to convolutions. We introduce a novel two-dimensional relative self-attention mechanism that proves competitive in replacing convolutions as a stand-alone computational primitive for image classification. We find in control experiments that the best results are obtained when combining both convolutions and self-attention. We therefore propose to augment convolutional operators with this self-attention mechanism by concatenating convolutional feature maps with a set of feature maps produced via self-attention. Extensive experiments show that Attention Augmentation leads to consistent improvements in image classification on ImageNet and object detection on COCO across many different models and scales, including ResNets and a state-of-the art mobile constrained network, while keeping the number of parameters similar. In particular, our method achieves a $1.3\%$ top-1 accuracy improvement on ImageNet classification over a ResNet50 baseline and outperforms other attention mechanisms for images such as Squeeze-and-Excitation. It also achieves an improvement of 1.4 mAP in COCO Object Detection on top of a RetinaNet baseline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1904.09925v5-abstract-full').style.display = 'none'; document.getElementById('1904.09925v5-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 April, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICCV 2019</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1810.02019">arXiv:1810.02019</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1810.02019">pdf</a>, <a href="https://arxiv.org/format/1810.02019">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Seq2Slate: Re-ranking and Slate Optimization with RNNs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bello%2C+I">Irwan Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Kulkarni%2C+S">Sayali Kulkarni</a>, <a href="/search/cs?searchtype=author&amp;query=Jain%2C+S">Sagar Jain</a>, <a href="/search/cs?searchtype=author&amp;query=Boutilier%2C+C">Craig Boutilier</a>, <a href="/search/cs?searchtype=author&amp;query=Chi%2C+E">Ed Chi</a>, <a href="/search/cs?searchtype=author&amp;query=Eban%2C+E">Elad Eban</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+X">Xiyang Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Mackey%2C+A">Alan Mackey</a>, <a href="/search/cs?searchtype=author&amp;query=Meshi%2C+O">Ofer Meshi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1810.02019v3-abstract-short" style="display: inline;"> Ranking is a central task in machine learning and information retrieval. In this task, it is especially important to present the user with a slate of items that is appealing as a whole. This in turn requires taking into account interactions between items, since intuitively, placing an item on the slate affects the decision of which other items should be placed alongside it. In this work, we propos&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1810.02019v3-abstract-full').style.display = 'inline'; document.getElementById('1810.02019v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1810.02019v3-abstract-full" style="display: none;"> Ranking is a central task in machine learning and information retrieval. In this task, it is especially important to present the user with a slate of items that is appealing as a whole. This in turn requires taking into account interactions between items, since intuitively, placing an item on the slate affects the decision of which other items should be placed alongside it. In this work, we propose a sequence-to-sequence model for ranking called seq2slate. At each step, the model predicts the next `best&#39; item to place on the slate given the items already selected. The sequential nature of the model allows complex dependencies between the items to be captured directly in a flexible and scalable way. We show how to learn the model end-to-end from weak supervision in the form of easily obtained click-through data. We further demonstrate the usefulness of our approach in experiments on standard ranking benchmarks as well as in a real-world recommendation system. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1810.02019v3-abstract-full').style.display = 'none'; document.getElementById('1810.02019v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 March, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 October, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2018. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1808.02822">arXiv:1808.02822</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1808.02822">pdf</a>, <a href="https://arxiv.org/format/1808.02822">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Backprop Evolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Alber%2C+M">Maximilian Alber</a>, <a href="/search/cs?searchtype=author&amp;query=Bello%2C+I">Irwan Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Zoph%2C+B">Barret Zoph</a>, <a href="/search/cs?searchtype=author&amp;query=Kindermans%2C+P">Pieter-Jan Kindermans</a>, <a href="/search/cs?searchtype=author&amp;query=Ramachandran%2C+P">Prajit Ramachandran</a>, <a href="/search/cs?searchtype=author&amp;query=Le%2C+Q">Quoc Le</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1808.02822v1-abstract-short" style="display: inline;"> The back-propagation algorithm is the cornerstone of deep learning. Despite its importance, few variations of the algorithm have been attempted. This work presents an approach to discover new variations of the back-propagation equation. We use a domain specific lan- guage to describe update equations as a list of primitive functions. An evolution-based method is used to discover new propagation ru&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1808.02822v1-abstract-full').style.display = 'inline'; document.getElementById('1808.02822v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1808.02822v1-abstract-full" style="display: none;"> The back-propagation algorithm is the cornerstone of deep learning. Despite its importance, few variations of the algorithm have been attempted. This work presents an approach to discover new variations of the back-propagation equation. We use a domain specific lan- guage to describe update equations as a list of primitive functions. An evolution-based method is used to discover new propagation rules that maximize the generalization per- formance after a few epochs of training. We find several update equations that can train faster with short training times than standard back-propagation, and perform similar as standard back-propagation at convergence. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1808.02822v1-abstract-full').style.display = 'none'; document.getElementById('1808.02822v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 August, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2018. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1709.07417">arXiv:1709.07417</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1709.07417">pdf</a>, <a href="https://arxiv.org/format/1709.07417">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Neural Optimizer Search with Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bello%2C+I">Irwan Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Zoph%2C+B">Barret Zoph</a>, <a href="/search/cs?searchtype=author&amp;query=Vasudevan%2C+V">Vijay Vasudevan</a>, <a href="/search/cs?searchtype=author&amp;query=Le%2C+Q+V">Quoc V. Le</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1709.07417v2-abstract-short" style="display: inline;"> We present an approach to automate the process of discovering optimization methods, with a focus on deep learning architectures. We train a Recurrent Neural Network controller to generate a string in a domain specific language that describes a mathematical update equation based on a list of primitive functions, such as the gradient, running average of the gradient, etc. The controller is trained w&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1709.07417v2-abstract-full').style.display = 'inline'; document.getElementById('1709.07417v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1709.07417v2-abstract-full" style="display: none;"> We present an approach to automate the process of discovering optimization methods, with a focus on deep learning architectures. We train a Recurrent Neural Network controller to generate a string in a domain specific language that describes a mathematical update equation based on a list of primitive functions, such as the gradient, running average of the gradient, etc. The controller is trained with Reinforcement Learning to maximize the performance of a model after a few epochs. On CIFAR-10, our method discovers several update rules that are better than many commonly used optimizers, such as Adam, RMSProp, or SGD with and without Momentum on a ConvNet model. We introduce two new optimizers, named PowerSign and AddSign, which we show transfer well and improve training on a variety of different tasks and architectures, including ImageNet classification and Google&#39;s neural machine translation system. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1709.07417v2-abstract-full').style.display = 'none'; document.getElementById('1709.07417v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 September, 2017; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 September, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2017. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2017 Conference paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1611.09940">arXiv:1611.09940</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/1611.09940">pdf</a>, <a href="https://arxiv.org/ps/1611.09940">ps</a>, <a href="https://arxiv.org/format/1611.09940">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Neural Combinatorial Optimization with Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bello%2C+I">Irwan Bello</a>, <a href="/search/cs?searchtype=author&amp;query=Pham%2C+H">Hieu Pham</a>, <a href="/search/cs?searchtype=author&amp;query=Le%2C+Q+V">Quoc V. Le</a>, <a href="/search/cs?searchtype=author&amp;query=Norouzi%2C+M">Mohammad Norouzi</a>, <a href="/search/cs?searchtype=author&amp;query=Bengio%2C+S">Samy Bengio</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1611.09940v3-abstract-short" style="display: inline;"> This paper presents a framework to tackle combinatorial optimization problems using neural networks and reinforcement learning. We focus on the traveling salesman problem (TSP) and train a recurrent network that, given a set of city coordinates, predicts a distribution over different city permutations. Using negative tour length as the reward signal, we optimize the parameters of the recurrent net&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1611.09940v3-abstract-full').style.display = 'inline'; document.getElementById('1611.09940v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1611.09940v3-abstract-full" style="display: none;"> This paper presents a framework to tackle combinatorial optimization problems using neural networks and reinforcement learning. We focus on the traveling salesman problem (TSP) and train a recurrent network that, given a set of city coordinates, predicts a distribution over different city permutations. Using negative tour length as the reward signal, we optimize the parameters of the recurrent network using a policy gradient method. We compare learning the network parameters on a set of training graphs against learning them on individual test graphs. Despite the computational expense, without much engineering and heuristic designing, Neural Combinatorial Optimization achieves close to optimal results on 2D Euclidean graphs with up to 100 nodes. Applied to the KnapSack, another NP-hard problem, the same method obtains optimal solutions for instances with up to 200 items. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1611.09940v3-abstract-full').style.display = 'none'; document.getElementById('1611.09940v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 January, 2017; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 November, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2016. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review as a conference paper at ICLR 2017</span> </p> </li> </ol> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10