CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–17 of 17 results for author: <span class="mathjax">Shyam, P</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Shyam%2C+P">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Shyam, P"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Shyam%2C+P&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Shyam, P"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12219">arXiv:2410.12219</a> <span> [<a href="https://arxiv.org/pdf/2410.12219">pdf</a>, <a href="https://arxiv.org/format/2410.12219">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> OmnixR: Evaluating Omni-modality Language Models on Reasoning across Modalities </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lichang Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hexiang Hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingda Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yiwen Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zifeng Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yandong Li</a>, <a href="/search/cs?searchtype=author&query=Shyam%2C+P">Pranav Shyam</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+T">Tianyi Zhou</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Heng Huang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Ming-Hsuan Yang</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+B">Boqing Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12219v1-abstract-short" style="display: inline;"> We introduce OmnixR, an evaluation suite designed to benchmark SoTA Omni-modality Language Models, such as GPT-4o and Gemini. Evaluating OLMs, which integrate multiple modalities such as text, vision, and audio, presents unique challenges. Particularly, the user message might often consist of multiple modalities, such that OLMs have to establish holistic understanding and reasoning across modaliti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12219v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12219v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12219v1-abstract-full" style="display: none;"> We introduce OmnixR, an evaluation suite designed to benchmark SoTA Omni-modality Language Models, such as GPT-4o and Gemini. Evaluating OLMs, which integrate multiple modalities such as text, vision, and audio, presents unique challenges. Particularly, the user message might often consist of multiple modalities, such that OLMs have to establish holistic understanding and reasoning across modalities to accomplish the task. Existing benchmarks are limited to single modality or dual-modality tasks, overlooking comprehensive multi-modal assessments of model reasoning. To address this, OmnixR offers two evaluation variants: (1)synthetic subset: a synthetic dataset generated automatically by translating text into multiple modalities--audio, images, video, and hybrids (Omnify). (2)realistic subset: a real-world dataset, manually curated and annotated by experts, for evaluating cross-modal reasoning in natural settings. OmnixR presents a unique evaluation towards assessing OLMs over a diverse mix of modalities, such as a question that involves video, audio, and text, providing a rigorous cross-modal reasoning testbed unlike any existing benchmarks. Our experiments find that all state-of-the-art OLMs struggle with OmnixR questions that require integrating information from multiple modalities to answer. Further analysis highlights differences in reasoning behavior, underscoring the challenges of omni-modal AI alignment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12219v1-abstract-full').style.display = 'none'; document.getElementById('2410.12219v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 6 figures, 12 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.05530">arXiv:2403.05530</a> <span> [<a href="https://arxiv.org/pdf/2403.05530">pdf</a>, <a href="https://arxiv.org/format/2403.05530">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gemini+Team"> Gemini Team</a>, <a href="/search/cs?searchtype=author&query=Georgiev%2C+P">Petko Georgiev</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+V+I">Ving Ian Lei</a>, <a href="/search/cs?searchtype=author&query=Burnell%2C+R">Ryan Burnell</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Libin Bai</a>, <a href="/search/cs?searchtype=author&query=Gulati%2C+A">Anmol Gulati</a>, <a href="/search/cs?searchtype=author&query=Tanzer%2C+G">Garrett Tanzer</a>, <a href="/search/cs?searchtype=author&query=Vincent%2C+D">Damien Vincent</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Z">Zhufeng Pan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shibo Wang</a>, <a href="/search/cs?searchtype=author&query=Mariooryad%2C+S">Soroosh Mariooryad</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Y">Yifan Ding</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+X">Xinyang Geng</a>, <a href="/search/cs?searchtype=author&query=Alcober%2C+F">Fred Alcober</a>, <a href="/search/cs?searchtype=author&query=Frostig%2C+R">Roy Frostig</a>, <a href="/search/cs?searchtype=author&query=Omernick%2C+M">Mark Omernick</a>, <a href="/search/cs?searchtype=author&query=Walker%2C+L">Lexi Walker</a>, <a href="/search/cs?searchtype=author&query=Paduraru%2C+C">Cosmin Paduraru</a>, <a href="/search/cs?searchtype=author&query=Sorokin%2C+C">Christina Sorokin</a>, <a href="/search/cs?searchtype=author&query=Tacchetti%2C+A">Andrea Tacchetti</a>, <a href="/search/cs?searchtype=author&query=Gaffney%2C+C">Colin Gaffney</a>, <a href="/search/cs?searchtype=author&query=Daruki%2C+S">Samira Daruki</a>, <a href="/search/cs?searchtype=author&query=Sercinoglu%2C+O">Olcan Sercinoglu</a>, <a href="/search/cs?searchtype=author&query=Gleicher%2C+Z">Zach Gleicher</a>, <a href="/search/cs?searchtype=author&query=Love%2C+J">Juliette Love</a> , et al. (1112 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.05530v5-abstract-short" style="display: inline;"> In this report, we introduce the Gemini 1.5 family of models, representing the next generation of highly compute-efficient multimodal models capable of recalling and reasoning over fine-grained information from millions of tokens of context, including multiple long documents and hours of video and audio. The family includes two new models: (1) an updated Gemini 1.5 Pro, which exceeds the February… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05530v5-abstract-full').style.display = 'inline'; document.getElementById('2403.05530v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.05530v5-abstract-full" style="display: none;"> In this report, we introduce the Gemini 1.5 family of models, representing the next generation of highly compute-efficient multimodal models capable of recalling and reasoning over fine-grained information from millions of tokens of context, including multiple long documents and hours of video and audio. The family includes two new models: (1) an updated Gemini 1.5 Pro, which exceeds the February version on the great majority of capabilities and benchmarks; (2) Gemini 1.5 Flash, a more lightweight variant designed for efficiency with minimal regression in quality. Gemini 1.5 models achieve near-perfect recall on long-context retrieval tasks across modalities, improve the state-of-the-art in long-document QA, long-video QA and long-context ASR, and match or surpass Gemini 1.0 Ultra's state-of-the-art performance across a broad set of benchmarks. Studying the limits of Gemini 1.5's long-context ability, we find continued improvement in next-token prediction and near-perfect retrieval (>99%) up to at least 10M tokens, a generational leap over existing models such as Claude 3.0 (200k) and GPT-4 Turbo (128k). Finally, we highlight real-world use cases, such as Gemini 1.5 collaborating with professionals on completing their tasks achieving 26 to 75% time savings across 10 different job categories, as well as surprising new capabilities of large language models at the frontier; when given a grammar manual for Kalamang, a language with fewer than 200 speakers worldwide, the model learns to translate English to Kalamang at a similar level to a person who learned from the same content. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.05530v5-abstract-full').style.display = 'none'; document.getElementById('2403.05530v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.11805">arXiv:2312.11805</a> <span> [<a href="https://arxiv.org/pdf/2312.11805">pdf</a>, <a href="https://arxiv.org/format/2312.11805">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Gemini: A Family of Highly Capable Multimodal Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gemini+Team"> Gemini Team</a>, <a href="/search/cs?searchtype=author&query=Anil%2C+R">Rohan Anil</a>, <a href="/search/cs?searchtype=author&query=Borgeaud%2C+S">Sebastian Borgeaud</a>, <a href="/search/cs?searchtype=author&query=Alayrac%2C+J">Jean-Baptiste Alayrac</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jiahui Yu</a>, <a href="/search/cs?searchtype=author&query=Soricut%2C+R">Radu Soricut</a>, <a href="/search/cs?searchtype=author&query=Schalkwyk%2C+J">Johan Schalkwyk</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+A+M">Andrew M. Dai</a>, <a href="/search/cs?searchtype=author&query=Hauth%2C+A">Anja Hauth</a>, <a href="/search/cs?searchtype=author&query=Millican%2C+K">Katie Millican</a>, <a href="/search/cs?searchtype=author&query=Silver%2C+D">David Silver</a>, <a href="/search/cs?searchtype=author&query=Johnson%2C+M">Melvin Johnson</a>, <a href="/search/cs?searchtype=author&query=Antonoglou%2C+I">Ioannis Antonoglou</a>, <a href="/search/cs?searchtype=author&query=Schrittwieser%2C+J">Julian Schrittwieser</a>, <a href="/search/cs?searchtype=author&query=Glaese%2C+A">Amelia Glaese</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jilin Chen</a>, <a href="/search/cs?searchtype=author&query=Pitler%2C+E">Emily Pitler</a>, <a href="/search/cs?searchtype=author&query=Lillicrap%2C+T">Timothy Lillicrap</a>, <a href="/search/cs?searchtype=author&query=Lazaridou%2C+A">Angeliki Lazaridou</a>, <a href="/search/cs?searchtype=author&query=Firat%2C+O">Orhan Firat</a>, <a href="/search/cs?searchtype=author&query=Molloy%2C+J">James Molloy</a>, <a href="/search/cs?searchtype=author&query=Isard%2C+M">Michael Isard</a>, <a href="/search/cs?searchtype=author&query=Barham%2C+P+R">Paul R. Barham</a>, <a href="/search/cs?searchtype=author&query=Hennigan%2C+T">Tom Hennigan</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+B">Benjamin Lee</a> , et al. (1325 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.11805v4-abstract-short" style="display: inline;"> This report introduces a new family of multimodal models, Gemini, that exhibit remarkable capabilities across image, audio, video, and text understanding. The Gemini family consists of Ultra, Pro, and Nano sizes, suitable for applications ranging from complex reasoning tasks to on-device memory-constrained use-cases. Evaluation on a broad range of benchmarks shows that our most-capable Gemini Ultr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.11805v4-abstract-full').style.display = 'inline'; document.getElementById('2312.11805v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.11805v4-abstract-full" style="display: none;"> This report introduces a new family of multimodal models, Gemini, that exhibit remarkable capabilities across image, audio, video, and text understanding. The Gemini family consists of Ultra, Pro, and Nano sizes, suitable for applications ranging from complex reasoning tasks to on-device memory-constrained use-cases. Evaluation on a broad range of benchmarks shows that our most-capable Gemini Ultra model advances the state of the art in 30 of 32 of these benchmarks - notably being the first model to achieve human-expert performance on the well-studied exam benchmark MMLU, and improving the state of the art in every one of the 20 multimodal benchmarks we examined. We believe that the new capabilities of the Gemini family in cross-modal reasoning and language understanding will enable a wide variety of use cases. We discuss our approach toward post-training and deploying Gemini models responsibly to users through services including Gemini, Gemini Advanced, Google AI Studio, and Cloud Vertex AI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.11805v4-abstract-full').style.display = 'none'; document.getElementById('2312.11805v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.08774">arXiv:2303.08774</a> <span> [<a href="https://arxiv.org/pdf/2303.08774">pdf</a>, <a href="https://arxiv.org/format/2303.08774">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> GPT-4 Technical Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=OpenAI"> OpenAI</a>, <a href="/search/cs?searchtype=author&query=Achiam%2C+J">Josh Achiam</a>, <a href="/search/cs?searchtype=author&query=Adler%2C+S">Steven Adler</a>, <a href="/search/cs?searchtype=author&query=Agarwal%2C+S">Sandhini Agarwal</a>, <a href="/search/cs?searchtype=author&query=Ahmad%2C+L">Lama Ahmad</a>, <a href="/search/cs?searchtype=author&query=Akkaya%2C+I">Ilge Akkaya</a>, <a href="/search/cs?searchtype=author&query=Aleman%2C+F+L">Florencia Leoni Aleman</a>, <a href="/search/cs?searchtype=author&query=Almeida%2C+D">Diogo Almeida</a>, <a href="/search/cs?searchtype=author&query=Altenschmidt%2C+J">Janko Altenschmidt</a>, <a href="/search/cs?searchtype=author&query=Altman%2C+S">Sam Altman</a>, <a href="/search/cs?searchtype=author&query=Anadkat%2C+S">Shyamal Anadkat</a>, <a href="/search/cs?searchtype=author&query=Avila%2C+R">Red Avila</a>, <a href="/search/cs?searchtype=author&query=Babuschkin%2C+I">Igor Babuschkin</a>, <a href="/search/cs?searchtype=author&query=Balaji%2C+S">Suchir Balaji</a>, <a href="/search/cs?searchtype=author&query=Balcom%2C+V">Valerie Balcom</a>, <a href="/search/cs?searchtype=author&query=Baltescu%2C+P">Paul Baltescu</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+H">Haiming Bao</a>, <a href="/search/cs?searchtype=author&query=Bavarian%2C+M">Mohammad Bavarian</a>, <a href="/search/cs?searchtype=author&query=Belgum%2C+J">Jeff Belgum</a>, <a href="/search/cs?searchtype=author&query=Bello%2C+I">Irwan Bello</a>, <a href="/search/cs?searchtype=author&query=Berdine%2C+J">Jake Berdine</a>, <a href="/search/cs?searchtype=author&query=Bernadett-Shapiro%2C+G">Gabriel Bernadett-Shapiro</a>, <a href="/search/cs?searchtype=author&query=Berner%2C+C">Christopher Berner</a>, <a href="/search/cs?searchtype=author&query=Bogdonoff%2C+L">Lenny Bogdonoff</a>, <a href="/search/cs?searchtype=author&query=Boiko%2C+O">Oleg Boiko</a> , et al. (256 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.08774v6-abstract-short" style="display: inline;"> We report the development of GPT-4, a large-scale, multimodal model which can accept image and text inputs and produce text outputs. While less capable than humans in many real-world scenarios, GPT-4 exhibits human-level performance on various professional and academic benchmarks, including passing a simulated bar exam with a score around the top 10% of test takers. GPT-4 is a Transformer-based mo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.08774v6-abstract-full').style.display = 'inline'; document.getElementById('2303.08774v6-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.08774v6-abstract-full" style="display: none;"> We report the development of GPT-4, a large-scale, multimodal model which can accept image and text inputs and produce text outputs. While less capable than humans in many real-world scenarios, GPT-4 exhibits human-level performance on various professional and academic benchmarks, including passing a simulated bar exam with a score around the top 10% of test takers. GPT-4 is a Transformer-based model pre-trained to predict the next token in a document. The post-training alignment process results in improved performance on measures of factuality and adherence to desired behavior. A core component of this project was developing infrastructure and optimization methods that behave predictably across a wide range of scales. This allowed us to accurately predict some aspects of GPT-4's performance based on models trained with no more than 1/1,000th the compute of GPT-4. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.08774v6-abstract-full').style.display = 'none'; document.getElementById('2303.08774v6-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">100 pages; updated authors list; fixed author names and added citation</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2204.01795">arXiv:2204.01795</a> <span> [<a href="https://arxiv.org/pdf/2204.01795">pdf</a>, <a href="https://arxiv.org/format/2204.01795">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Lightweight HDR Camera ISP for Robust Perception in Dynamic Illumination Conditions via Fourier Adversarial Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shyam%2C+P">Pranjay Shyam</a>, <a href="/search/cs?searchtype=author&query=Sengar%2C+S+S">Sandeep Singh Sengar</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+K">Kuk-Jin Yoon</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+K">Kyung-Soo Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2204.01795v2-abstract-short" style="display: inline;"> The limited dynamic range of commercial compact camera sensors results in an inaccurate representation of scenes with varying illumination conditions, adversely affecting image quality and subsequently limiting the performance of underlying image processing algorithms. Current state-of-the-art (SoTA) convolutional neural networks (CNN) are developed as post-processing techniques to independently r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.01795v2-abstract-full').style.display = 'inline'; document.getElementById('2204.01795v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2204.01795v2-abstract-full" style="display: none;"> The limited dynamic range of commercial compact camera sensors results in an inaccurate representation of scenes with varying illumination conditions, adversely affecting image quality and subsequently limiting the performance of underlying image processing algorithms. Current state-of-the-art (SoTA) convolutional neural networks (CNN) are developed as post-processing techniques to independently recover under-/over-exposed images. However, when applied to images containing real-world degradations such as glare, high-beam, color bleeding with varying noise intensity, these algorithms amplify the degradations, further degrading image quality. We propose a lightweight two-stage image enhancement algorithm sequentially balancing illumination and noise removal using frequency priors for structural guidance to overcome these limitations. Furthermore, to ensure realistic image quality, we leverage the relationship between frequency and spatial domain properties of an image and propose a Fourier spectrum-based adversarial framework (AFNet) for consistent image enhancement under varying illumination conditions. While current formulations of image enhancement are envisioned as post-processing techniques, we examine if such an algorithm could be extended to integrate the functionality of the Image Signal Processing (ISP) pipeline within the camera sensor benefiting from RAW sensor data and lightweight CNN architecture. Based on quantitative and qualitative evaluations, we also examine the practicality and effects of image enhancement techniques on the performance of common perception tasks such as object detection and semantic segmentation in varying illumination conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2204.01795v2-abstract-full').style.display = 'none'; document.getElementById('2204.01795v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 April, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in BMVC 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.13144">arXiv:2202.13144</a> <span> [<a href="https://arxiv.org/pdf/2202.13144">pdf</a>, <a href="https://arxiv.org/format/2202.13144">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DGSS : Domain Generalized Semantic Segmentation using Iterative Style Mining and Latent Representation Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shyam%2C+P">Pranjay Shyam</a>, <a href="/search/cs?searchtype=author&query=Bangunharcana%2C+A">Antyanta Bangunharcana</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+K">Kuk-Jin Yoon</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+K">Kyung-Soo Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.13144v1-abstract-short" style="display: inline;"> Semantic segmentation algorithms require access to well-annotated datasets captured under diverse illumination conditions to ensure consistent performance. However, poor visibility conditions at varying illumination conditions result in laborious and error-prone labeling. Alternatively, using synthetic samples to train segmentation algorithms has gained interest with the drawback of domain gap tha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.13144v1-abstract-full').style.display = 'inline'; document.getElementById('2202.13144v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.13144v1-abstract-full" style="display: none;"> Semantic segmentation algorithms require access to well-annotated datasets captured under diverse illumination conditions to ensure consistent performance. However, poor visibility conditions at varying illumination conditions result in laborious and error-prone labeling. Alternatively, using synthetic samples to train segmentation algorithms has gained interest with the drawback of domain gap that results in sub-optimal performance. While current state-of-the-art (SoTA) have proposed different mechanisms to bridge the domain gap, they still perform poorly in low illumination conditions with an average performance drop of - 10.7 mIOU. In this paper, we focus upon single source domain generalization to overcome the domain gap and propose a two-step framework wherein we first identify an adversarial style that maximizes the domain gap between stylized and source images. Subsequently, these stylized images are used to categorically align features such that features belonging to the same class are clustered together in latent space, irrespective of domain gap. Furthermore, to increase intra-class variance while training, we propose a style mixing mechanism wherein the same objects from different styles are mixed to construct a new training image. This framework allows us to achieve a domain generalized semantic segmentation algorithm with consistent performance without prior information of the target domain while relying on a single source. Based on extensive experiments, we match SoTA performance on SYNTHIA $\to$ Cityscapes, GTAV $\to$ Cityscapes while setting new SoTA on GTAV $\to$ Dark Zurich and GTAV $\to$ Night Driving benchmarks without retraining. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.13144v1-abstract-full').style.display = 'none'; document.getElementById('2202.13144v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2201.10005">arXiv:2201.10005</a> <span> [<a href="https://arxiv.org/pdf/2201.10005">pdf</a>, <a href="https://arxiv.org/format/2201.10005">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Text and Code Embeddings by Contrastive Pre-Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Neelakantan%2C+A">Arvind Neelakantan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+T">Tao Xu</a>, <a href="/search/cs?searchtype=author&query=Puri%2C+R">Raul Puri</a>, <a href="/search/cs?searchtype=author&query=Radford%2C+A">Alec Radford</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J+M">Jesse Michael Han</a>, <a href="/search/cs?searchtype=author&query=Tworek%2C+J">Jerry Tworek</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Q">Qiming Yuan</a>, <a href="/search/cs?searchtype=author&query=Tezak%2C+N">Nikolas Tezak</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+J+W">Jong Wook Kim</a>, <a href="/search/cs?searchtype=author&query=Hallacy%2C+C">Chris Hallacy</a>, <a href="/search/cs?searchtype=author&query=Heidecke%2C+J">Johannes Heidecke</a>, <a href="/search/cs?searchtype=author&query=Shyam%2C+P">Pranav Shyam</a>, <a href="/search/cs?searchtype=author&query=Power%2C+B">Boris Power</a>, <a href="/search/cs?searchtype=author&query=Nekoul%2C+T+E">Tyna Eloundou Nekoul</a>, <a href="/search/cs?searchtype=author&query=Sastry%2C+G">Girish Sastry</a>, <a href="/search/cs?searchtype=author&query=Krueger%2C+G">Gretchen Krueger</a>, <a href="/search/cs?searchtype=author&query=Schnurr%2C+D">David Schnurr</a>, <a href="/search/cs?searchtype=author&query=Such%2C+F+P">Felipe Petroski Such</a>, <a href="/search/cs?searchtype=author&query=Hsu%2C+K">Kenny Hsu</a>, <a href="/search/cs?searchtype=author&query=Thompson%2C+M">Madeleine Thompson</a>, <a href="/search/cs?searchtype=author&query=Khan%2C+T">Tabarak Khan</a>, <a href="/search/cs?searchtype=author&query=Sherbakov%2C+T">Toki Sherbakov</a>, <a href="/search/cs?searchtype=author&query=Jang%2C+J">Joanne Jang</a>, <a href="/search/cs?searchtype=author&query=Welinder%2C+P">Peter Welinder</a>, <a href="/search/cs?searchtype=author&query=Weng%2C+L">Lilian Weng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2201.10005v1-abstract-short" style="display: inline;"> Text embeddings are useful features in many applications such as semantic search and computing text similarity. Previous work typically trains models customized for different use cases, varying in dataset choice, training objective and model architecture. In this work, we show that contrastive pre-training on unsupervised data at scale leads to high quality vector representations of text and code.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.10005v1-abstract-full').style.display = 'inline'; document.getElementById('2201.10005v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2201.10005v1-abstract-full" style="display: none;"> Text embeddings are useful features in many applications such as semantic search and computing text similarity. Previous work typically trains models customized for different use cases, varying in dataset choice, training objective and model architecture. In this work, we show that contrastive pre-training on unsupervised data at scale leads to high quality vector representations of text and code. The same unsupervised text embeddings that achieve new state-of-the-art results in linear-probe classification also display impressive semantic search capabilities and sometimes even perform competitively with fine-tuned models. On linear-probe classification accuracy averaging over 7 tasks, our best unsupervised model achieves a relative improvement of 4% and 1.8% over previous best unsupervised and supervised text embedding models respectively. The same text embeddings when evaluated on large-scale semantic search attains a relative improvement of 23.4%, 14.7%, and 10.6% over previous best unsupervised methods on MSMARCO, Natural Questions and TriviaQA benchmarks, respectively. Similarly to text embeddings, we train code embedding models on (text, code) pairs, obtaining a 20.8% relative improvement over prior best work on code search. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2201.10005v1-abstract-full').style.display = 'none'; document.getElementById('2201.10005v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 January, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2112.10741">arXiv:2112.10741</a> <span> [<a href="https://arxiv.org/pdf/2112.10741">pdf</a>, <a href="https://arxiv.org/format/2112.10741">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nichol%2C+A">Alex Nichol</a>, <a href="/search/cs?searchtype=author&query=Dhariwal%2C+P">Prafulla Dhariwal</a>, <a href="/search/cs?searchtype=author&query=Ramesh%2C+A">Aditya Ramesh</a>, <a href="/search/cs?searchtype=author&query=Shyam%2C+P">Pranav Shyam</a>, <a href="/search/cs?searchtype=author&query=Mishkin%2C+P">Pamela Mishkin</a>, <a href="/search/cs?searchtype=author&query=McGrew%2C+B">Bob McGrew</a>, <a href="/search/cs?searchtype=author&query=Sutskever%2C+I">Ilya Sutskever</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mark Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2112.10741v3-abstract-short" style="display: inline;"> Diffusion models have recently been shown to generate high-quality synthetic images, especially when paired with a guidance technique to trade off diversity for fidelity. We explore diffusion models for the problem of text-conditional image synthesis and compare two different guidance strategies: CLIP guidance and classifier-free guidance. We find that the latter is preferred by human evaluators f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.10741v3-abstract-full').style.display = 'inline'; document.getElementById('2112.10741v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2112.10741v3-abstract-full" style="display: none;"> Diffusion models have recently been shown to generate high-quality synthetic images, especially when paired with a guidance technique to trade off diversity for fidelity. We explore diffusion models for the problem of text-conditional image synthesis and compare two different guidance strategies: CLIP guidance and classifier-free guidance. We find that the latter is preferred by human evaluators for both photorealism and caption similarity, and often produces photorealistic samples. Samples from a 3.5 billion parameter text-conditional diffusion model using classifier-free guidance are favored by human evaluators to those from DALL-E, even when the latter uses expensive CLIP reranking. Additionally, we find that our models can be fine-tuned to perform image inpainting, enabling powerful text-driven image editing. We train a smaller model on a filtered dataset and release the code and weights at https://github.com/openai/glide-text2im. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2112.10741v3-abstract-full').style.display = 'none'; document.getElementById('2112.10741v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 December, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 18 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.05448">arXiv:2110.05448</a> <span> [<a href="https://arxiv.org/pdf/2110.05448">pdf</a>, <a href="https://arxiv.org/format/2110.05448">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Unsupervised Neural Machine Translation with Generative Language Models Only </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+J+M">Jesse Michael Han</a>, <a href="/search/cs?searchtype=author&query=Babuschkin%2C+I">Igor Babuschkin</a>, <a href="/search/cs?searchtype=author&query=Edwards%2C+H">Harrison Edwards</a>, <a href="/search/cs?searchtype=author&query=Neelakantan%2C+A">Arvind Neelakantan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+T">Tao Xu</a>, <a href="/search/cs?searchtype=author&query=Polu%2C+S">Stanislas Polu</a>, <a href="/search/cs?searchtype=author&query=Ray%2C+A">Alex Ray</a>, <a href="/search/cs?searchtype=author&query=Shyam%2C+P">Pranav Shyam</a>, <a href="/search/cs?searchtype=author&query=Ramesh%2C+A">Aditya Ramesh</a>, <a href="/search/cs?searchtype=author&query=Radford%2C+A">Alec Radford</a>, <a href="/search/cs?searchtype=author&query=Sutskever%2C+I">Ilya Sutskever</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.05448v1-abstract-short" style="display: inline;"> We show how to derive state-of-the-art unsupervised neural machine translation systems from generatively pre-trained language models. Our method consists of three steps: few-shot amplification, distillation, and backtranslation. We first use the zero-shot translation ability of large pre-trained language models to generate translations for a small set of unlabeled sentences. We then amplify these… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.05448v1-abstract-full').style.display = 'inline'; document.getElementById('2110.05448v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.05448v1-abstract-full" style="display: none;"> We show how to derive state-of-the-art unsupervised neural machine translation systems from generatively pre-trained language models. Our method consists of three steps: few-shot amplification, distillation, and backtranslation. We first use the zero-shot translation ability of large pre-trained language models to generate translations for a small set of unlabeled sentences. We then amplify these zero-shot translations by using them as few-shot demonstrations for sampling a larger synthetic dataset. This dataset is distilled by discarding the few-shot demonstrations and then fine-tuning. During backtranslation, we repeatedly generate translations for a set of inputs and then fine-tune a single language model on both directions of the translation task at once, ensuring cycle-consistency by swapping the roles of gold monotext and generated translations when fine-tuning. By using our method to leverage GPT-3's zero-shot translation capability, we achieve a new state-of-the-art in unsupervised translation on the WMT14 English-French benchmark, attaining a BLEU score of 42.1. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.05448v1-abstract-full').style.display = 'none'; document.getElementById('2110.05448v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2103.05889">arXiv:2103.05889</a> <span> [<a href="https://arxiv.org/pdf/2103.05889">pdf</a>, <a href="https://arxiv.org/format/2103.05889">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Evaluating COPY-BLEND Augmentation for Low Level Vision Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shyam%2C+P">Pranjay Shyam</a>, <a href="/search/cs?searchtype=author&query=Sengar%2C+S+S">Sandeep Singh Sengar</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+K">Kuk-Jin Yoon</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+K">Kyung-Soo Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2103.05889v1-abstract-short" style="display: inline;"> Region modification-based data augmentation techniques have shown to improve performance for high level vision tasks (object detection, semantic segmentation, image classification, etc.) by encouraging underlying algorithms to focus on multiple discriminative features. However, as these techniques destroy spatial relationship with neighboring regions, performance can be deteriorated when using the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.05889v1-abstract-full').style.display = 'inline'; document.getElementById('2103.05889v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2103.05889v1-abstract-full" style="display: none;"> Region modification-based data augmentation techniques have shown to improve performance for high level vision tasks (object detection, semantic segmentation, image classification, etc.) by encouraging underlying algorithms to focus on multiple discriminative features. However, as these techniques destroy spatial relationship with neighboring regions, performance can be deteriorated when using them to train algorithms designed for low level vision tasks (low light image enhancement, image dehazing, deblurring, etc.) where textural consistency between recovered and its neighboring regions is important to ensure effective performance. In this paper, we examine the efficacy of a simple copy-blend data augmentation technique that copies patches from noisy images and blends onto a clean image and vice versa to ensure that an underlying algorithm localizes and recovers affected regions resulting in increased perceptual quality of a recovered image. To assess performance improvement, we perform extensive experiments alongside different region modification-based augmentation techniques and report observations such as improved performance, reduced requirement for training dataset, and early convergence across tasks such as low light image enhancement, image dehazing and image deblurring without any modification to baseline algorithm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2103.05889v1-abstract-full').style.display = 'none'; document.getElementById('2103.05889v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2101.10449">arXiv:2101.10449</a> <span> [<a href="https://arxiv.org/pdf/2101.10449">pdf</a>, <a href="https://arxiv.org/format/2101.10449">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Domain Invariant Single Image Dehazing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shyam%2C+P">Pranjay Shyam</a>, <a href="/search/cs?searchtype=author&query=Yoon%2C+K">Kuk-Jin Yoon</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+K">Kyung-Soo Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2101.10449v1-abstract-short" style="display: inline;"> Presence of haze in images obscures underlying information, which is undesirable in applications requiring accurate environment information. To recover such an image, a dehazing algorithm should localize and recover affected regions while ensuring consistency between recovered and its neighboring regions. However owing to fixed receptive field of convolutional kernels and non uniform haze distribu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.10449v1-abstract-full').style.display = 'inline'; document.getElementById('2101.10449v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2101.10449v1-abstract-full" style="display: none;"> Presence of haze in images obscures underlying information, which is undesirable in applications requiring accurate environment information. To recover such an image, a dehazing algorithm should localize and recover affected regions while ensuring consistency between recovered and its neighboring regions. However owing to fixed receptive field of convolutional kernels and non uniform haze distribution, assuring consistency between regions is difficult. In this paper, we utilize an encoder-decoder based network architecture to perform the task of dehazing and integrate an spatially aware channel attention mechanism to enhance features of interest beyond the receptive field of traditional conventional kernels. To ensure performance consistency across diverse range of haze densities, we utilize greedy localized data augmentation mechanism. Synthetic datasets are typically used to ensure a large amount of paired training samples, however the methodology to generate such samples introduces a gap between them and real images while accounting for only uniform haze distribution and overlooking more realistic scenario of non-uniform haze distribution resulting in inferior dehazing performance when evaluated on real datasets. Despite this, the abundance of paired samples within synthetic datasets cannot be ignored. Thus to ensure performance consistency across diverse datasets, we train the proposed network within an adversarial prior-guided framework that relies on a generated image along with its low and high frequency components to determine if properties of dehazed images matches those of ground truth. We preform extensive experiments to validate the dehazing and domain invariance performance of proposed framework across diverse domains and report state-of-the-art (SoTA) results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2101.10449v1-abstract-full').style.display = 'none'; document.getElementById('2101.10449v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 January, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2009.00842">arXiv:2009.00842</a> <span> [<a href="https://arxiv.org/pdf/2009.00842">pdf</a>, <a href="https://arxiv.org/format/2009.00842">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Retaining Image Feature Matching Performance Under Low Light Conditions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shyam%2C+P">Pranjay Shyam</a>, <a href="/search/cs?searchtype=author&query=Bangunharcana%2C+A">Antyanta Bangunharcana</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+K">Kyung-Soo Kim</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2009.00842v1-abstract-short" style="display: inline;"> Poor image quality in low light images may result in a reduced number of feature matching between images. In this paper, we investigate the performance of feature extraction algorithms in low light environments. To find an optimal setting to retain feature matching performance in low light images, we look into the effect of changing feature acceptance threshold for feature detector and adding pre-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2009.00842v1-abstract-full').style.display = 'inline'; document.getElementById('2009.00842v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2009.00842v1-abstract-full" style="display: none;"> Poor image quality in low light images may result in a reduced number of feature matching between images. In this paper, we investigate the performance of feature extraction algorithms in low light environments. To find an optimal setting to retain feature matching performance in low light images, we look into the effect of changing feature acceptance threshold for feature detector and adding pre-processing in the form of Low Light Image Enhancement (LLIE) prior to feature detection. We observe that even in low light images, feature matching using traditional hand-crafted feature detectors still performs reasonably well by lowering the threshold parameter. We also show that applying Low Light Image Enhancement (LLIE) algorithms can improve feature matching even more when paired with the right feature extraction algorithm. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2009.00842v1-abstract-full').style.display = 'none'; document.getElementById('2009.00842v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 September, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in ICCAS 2020 - 20th International Conference on Control, Robotics, and Systems</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2005.14165">arXiv:2005.14165</a> <span> [<a href="https://arxiv.org/pdf/2005.14165">pdf</a>, <a href="https://arxiv.org/format/2005.14165">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Language Models are Few-Shot Learners </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Brown%2C+T+B">Tom B. Brown</a>, <a href="/search/cs?searchtype=author&query=Mann%2C+B">Benjamin Mann</a>, <a href="/search/cs?searchtype=author&query=Ryder%2C+N">Nick Ryder</a>, <a href="/search/cs?searchtype=author&query=Subbiah%2C+M">Melanie Subbiah</a>, <a href="/search/cs?searchtype=author&query=Kaplan%2C+J">Jared Kaplan</a>, <a href="/search/cs?searchtype=author&query=Dhariwal%2C+P">Prafulla Dhariwal</a>, <a href="/search/cs?searchtype=author&query=Neelakantan%2C+A">Arvind Neelakantan</a>, <a href="/search/cs?searchtype=author&query=Shyam%2C+P">Pranav Shyam</a>, <a href="/search/cs?searchtype=author&query=Sastry%2C+G">Girish Sastry</a>, <a href="/search/cs?searchtype=author&query=Askell%2C+A">Amanda Askell</a>, <a href="/search/cs?searchtype=author&query=Agarwal%2C+S">Sandhini Agarwal</a>, <a href="/search/cs?searchtype=author&query=Herbert-Voss%2C+A">Ariel Herbert-Voss</a>, <a href="/search/cs?searchtype=author&query=Krueger%2C+G">Gretchen Krueger</a>, <a href="/search/cs?searchtype=author&query=Henighan%2C+T">Tom Henighan</a>, <a href="/search/cs?searchtype=author&query=Child%2C+R">Rewon Child</a>, <a href="/search/cs?searchtype=author&query=Ramesh%2C+A">Aditya Ramesh</a>, <a href="/search/cs?searchtype=author&query=Ziegler%2C+D+M">Daniel M. Ziegler</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jeffrey Wu</a>, <a href="/search/cs?searchtype=author&query=Winter%2C+C">Clemens Winter</a>, <a href="/search/cs?searchtype=author&query=Hesse%2C+C">Christopher Hesse</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mark Chen</a>, <a href="/search/cs?searchtype=author&query=Sigler%2C+E">Eric Sigler</a>, <a href="/search/cs?searchtype=author&query=Litwin%2C+M">Mateusz Litwin</a>, <a href="/search/cs?searchtype=author&query=Gray%2C+S">Scott Gray</a>, <a href="/search/cs?searchtype=author&query=Chess%2C+B">Benjamin Chess</a> , et al. (6 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2005.14165v4-abstract-short" style="display: inline;"> Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.14165v4-abstract-full').style.display = 'inline'; document.getElementById('2005.14165v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2005.14165v4-abstract-full" style="display: none;"> Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2005.14165v4-abstract-full').style.display = 'none'; document.getElementById('2005.14165v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 July, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 May, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">40+32 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1912.02877">arXiv:1912.02877</a> <span> [<a href="https://arxiv.org/pdf/1912.02877">pdf</a>, <a href="https://arxiv.org/format/1912.02877">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Training Agents using Upside-Down Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Srivastava%2C+R+K">Rupesh Kumar Srivastava</a>, <a href="/search/cs?searchtype=author&query=Shyam%2C+P">Pranav Shyam</a>, <a href="/search/cs?searchtype=author&query=Mutz%2C+F">Filipe Mutz</a>, <a href="/search/cs?searchtype=author&query=Ja%C5%9Bkowski%2C+W">Wojciech Ja艣kowski</a>, <a href="/search/cs?searchtype=author&query=Schmidhuber%2C+J">J眉rgen Schmidhuber</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1912.02877v2-abstract-short" style="display: inline;"> We develop Upside-Down Reinforcement Learning (UDRL), a method for learning to act using only supervised learning techniques. Unlike traditional algorithms, UDRL does not use reward prediction or search for an optimal policy. Instead, it trains agents to follow commands such as "obtain so much total reward in so much time." Many of its general principles are outlined in a companion report; the goa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1912.02877v2-abstract-full').style.display = 'inline'; document.getElementById('1912.02877v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1912.02877v2-abstract-full" style="display: none;"> We develop Upside-Down Reinforcement Learning (UDRL), a method for learning to act using only supervised learning techniques. Unlike traditional algorithms, UDRL does not use reward prediction or search for an optimal policy. Instead, it trains agents to follow commands such as "obtain so much total reward in so much time." Many of its general principles are outlined in a companion report; the goal of this paper is to develop a practical learning algorithm and show that this conceptually simple perspective on agent training can produce a range of rewarding behaviors for multiple episodic environments. Experiments show that on some tasks UDRL's performance can be surprisingly competitive with, and even exceed that of some traditional baseline algorithms developed over decades of research. Based on these results, we suggest that alternative approaches to expected reward maximization have an important role to play in training useful autonomous agents. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1912.02877v2-abstract-full').style.display = 'none'; document.getElementById('1912.02877v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 December, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Extends NeurIPS 2019 Deep Reinforcement Learning workshop presentation</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1902.02441">arXiv:1902.02441</a> <span> [<a href="https://arxiv.org/pdf/1902.02441">pdf</a>, <a href="https://arxiv.org/format/1902.02441">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Artificial Intelligence for Prosthetics - challenge solutions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kidzi%C5%84ski%2C+%C5%81">艁ukasz Kidzi艅ski</a>, <a href="/search/cs?searchtype=author&query=Ong%2C+C">Carmichael Ong</a>, <a href="/search/cs?searchtype=author&query=Mohanty%2C+S+P">Sharada Prasanna Mohanty</a>, <a href="/search/cs?searchtype=author&query=Hicks%2C+J">Jennifer Hicks</a>, <a href="/search/cs?searchtype=author&query=Carroll%2C+S+F">Sean F. Carroll</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+B">Bo Zhou</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+H">Hongsheng Zeng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fan Wang</a>, <a href="/search/cs?searchtype=author&query=Lian%2C+R">Rongzhong Lian</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+H">Hao Tian</a>, <a href="/search/cs?searchtype=author&query=Ja%C5%9Bkowski%2C+W">Wojciech Ja艣kowski</a>, <a href="/search/cs?searchtype=author&query=Andersen%2C+G">Garrett Andersen</a>, <a href="/search/cs?searchtype=author&query=Lykkeb%C3%B8%2C+O+R">Odd Rune Lykkeb酶</a>, <a href="/search/cs?searchtype=author&query=Toklu%2C+N+E">Nihat Engin Toklu</a>, <a href="/search/cs?searchtype=author&query=Shyam%2C+P">Pranav Shyam</a>, <a href="/search/cs?searchtype=author&query=Srivastava%2C+R+K">Rupesh Kumar Srivastava</a>, <a href="/search/cs?searchtype=author&query=Kolesnikov%2C+S">Sergey Kolesnikov</a>, <a href="/search/cs?searchtype=author&query=Hrinchuk%2C+O">Oleksii Hrinchuk</a>, <a href="/search/cs?searchtype=author&query=Pechenko%2C+A">Anton Pechenko</a>, <a href="/search/cs?searchtype=author&query=Ljungstr%C3%B6m%2C+M">Mattias Ljungstr枚m</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhen Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xu Hu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Z">Zehong Hu</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+M">Minghui Qiu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jun Huang</a> , et al. (25 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1902.02441v1-abstract-short" style="display: inline;"> In the NeurIPS 2018 Artificial Intelligence for Prosthetics challenge, participants were tasked with building a controller for a musculoskeletal model with a goal of matching a given time-varying velocity vector. Top participants were invited to describe their algorithms. In this work, we describe the challenge and present thirteen solutions that used deep reinforcement learning approaches. Many s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1902.02441v1-abstract-full').style.display = 'inline'; document.getElementById('1902.02441v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1902.02441v1-abstract-full" style="display: none;"> In the NeurIPS 2018 Artificial Intelligence for Prosthetics challenge, participants were tasked with building a controller for a musculoskeletal model with a goal of matching a given time-varying velocity vector. Top participants were invited to describe their algorithms. In this work, we describe the challenge and present thirteen solutions that used deep reinforcement learning approaches. Many solutions use similar relaxations and heuristics, such as reward shaping, frame skipping, discretization of the action space, symmetry, and policy blending. However, each team implemented different modifications of the known algorithms by, for example, dividing the task into subtasks, learning low-level control, or by incorporating expert knowledge and using imitation learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1902.02441v1-abstract-full').style.display = 'none'; document.getElementById('1902.02441v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1810.12162">arXiv:1810.12162</a> <span> [<a href="https://arxiv.org/pdf/1810.12162">pdf</a>, <a href="https://arxiv.org/format/1810.12162">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Model-Based Active Exploration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shyam%2C+P">Pranav Shyam</a>, <a href="/search/cs?searchtype=author&query=Ja%C5%9Bkowski%2C+W">Wojciech Ja艣kowski</a>, <a href="/search/cs?searchtype=author&query=Gomez%2C+F">Faustino Gomez</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1810.12162v5-abstract-short" style="display: inline;"> Efficient exploration is an unsolved problem in Reinforcement Learning which is usually addressed by reactively rewarding the agent for fortuitously encountering novel situations. This paper introduces an efficient active exploration algorithm, Model-Based Active eXploration (MAX), which uses an ensemble of forward models to plan to observe novel events. This is carried out by optimizing agent beh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1810.12162v5-abstract-full').style.display = 'inline'; document.getElementById('1810.12162v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1810.12162v5-abstract-full" style="display: none;"> Efficient exploration is an unsolved problem in Reinforcement Learning which is usually addressed by reactively rewarding the agent for fortuitously encountering novel situations. This paper introduces an efficient active exploration algorithm, Model-Based Active eXploration (MAX), which uses an ensemble of forward models to plan to observe novel events. This is carried out by optimizing agent behaviour with respect to a measure of novelty derived from the Bayesian perspective of exploration, which is estimated using the disagreement between the futures predicted by the ensemble members. We show empirically that in semi-random discrete environments where directed exploration is critical to make progress, MAX is at least an order of magnitude more efficient than strong baselines. MAX scales to high-dimensional continuous environments where it builds task-agnostic models that can be used for any downstream task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1810.12162v5-abstract-full').style.display = 'none'; document.getElementById('1810.12162v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2019; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2018; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2018. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICML 2019. Code: https://github.com/nnaisense/max</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1703.00767">arXiv:1703.00767</a> <span> [<a href="https://arxiv.org/pdf/1703.00767">pdf</a>, <a href="https://arxiv.org/format/1703.00767">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Attentive Recurrent Comparators </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shyam%2C+P">Pranav Shyam</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+S">Shubham Gupta</a>, <a href="/search/cs?searchtype=author&query=Dukkipati%2C+A">Ambedkar Dukkipati</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1703.00767v3-abstract-short" style="display: inline;"> Rapid learning requires flexible representations to quickly adopt to new evidence. We develop a novel class of models called Attentive Recurrent Comparators (ARCs) that form representations of objects by cycling through them and making observations. Using the representations extracted by ARCs, we develop a way of approximating a \textit{dynamic representation space} and use it for one-shot learnin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1703.00767v3-abstract-full').style.display = 'inline'; document.getElementById('1703.00767v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1703.00767v3-abstract-full" style="display: none;"> Rapid learning requires flexible representations to quickly adopt to new evidence. We develop a novel class of models called Attentive Recurrent Comparators (ARCs) that form representations of objects by cycling through them and making observations. Using the representations extracted by ARCs, we develop a way of approximating a \textit{dynamic representation space} and use it for one-shot learning. In the task of one-shot classification on the Omniglot dataset, we achieve the state of the art performance with an error rate of 1.5\%. This represents the first super-human result achieved for this task with a generic model that uses only pixel information. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1703.00767v3-abstract-full').style.display = 'none'; document.getElementById('1703.00767v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 June, 2017; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 March, 2017; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2017. </p> </li> </ol> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>