Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–18 of 18 results for author: <span class="mathjax">Goodman, S</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Goodman%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Goodman, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Goodman%2C+S&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Goodman, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.09199">arXiv:2310.09199</a> <span> [<a href="https://arxiv.org/pdf/2310.09199">pdf</a>, <a href="https://arxiv.org/format/2310.09199">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PaLI-3 Vision Language Models: Smaller, Faster, Stronger </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiao Wang</a>, <a href="/search/cs?searchtype=author&query=Beyer%2C+L">Lucas Beyer</a>, <a href="/search/cs?searchtype=author&query=Kolesnikov%2C+A">Alexander Kolesnikov</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jialin Wu</a>, <a href="/search/cs?searchtype=author&query=Voigtlaender%2C+P">Paul Voigtlaender</a>, <a href="/search/cs?searchtype=author&query=Mustafa%2C+B">Basil Mustafa</a>, <a href="/search/cs?searchtype=author&query=Goodman%2C+S">Sebastian Goodman</a>, <a href="/search/cs?searchtype=author&query=Alabdulmohsin%2C+I">Ibrahim Alabdulmohsin</a>, <a href="/search/cs?searchtype=author&query=Padlewski%2C+P">Piotr Padlewski</a>, <a href="/search/cs?searchtype=author&query=Salz%2C+D">Daniel Salz</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+X">Xi Xiong</a>, <a href="/search/cs?searchtype=author&query=Vlasic%2C+D">Daniel Vlasic</a>, <a href="/search/cs?searchtype=author&query=Pavetic%2C+F">Filip Pavetic</a>, <a href="/search/cs?searchtype=author&query=Rong%2C+K">Keran Rong</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+T">Tianli Yu</a>, <a href="/search/cs?searchtype=author&query=Keysers%2C+D">Daniel Keysers</a>, <a href="/search/cs?searchtype=author&query=Zhai%2C+X">Xiaohua Zhai</a>, <a href="/search/cs?searchtype=author&query=Soricut%2C+R">Radu Soricut</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.09199v2-abstract-short" style="display: inline;"> This paper presents PaLI-3, a smaller, faster, and stronger vision language model (VLM) that compares favorably to similar models that are 10x larger. As part of arriving at this strong performance, we compare Vision Transformer (ViT) models pretrained using classification objectives to contrastively (SigLIP) pretrained ones. We find that, while slightly underperforming on standard image classific… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.09199v2-abstract-full').style.display = 'inline'; document.getElementById('2310.09199v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.09199v2-abstract-full" style="display: none;"> This paper presents PaLI-3, a smaller, faster, and stronger vision language model (VLM) that compares favorably to similar models that are 10x larger. As part of arriving at this strong performance, we compare Vision Transformer (ViT) models pretrained using classification objectives to contrastively (SigLIP) pretrained ones. We find that, while slightly underperforming on standard image classification benchmarks, SigLIP-based PaLI shows superior performance across various multimodal benchmarks, especially on localization and visually-situated text understanding. We scale the SigLIP image encoder up to 2 billion parameters, and achieves a new state-of-the-art on multilingual cross-modal retrieval. We hope that PaLI-3, at only 5B parameters, rekindles research on fundamental pieces of complex VLMs, and could fuel a new generation of scaled-up models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.09199v2-abstract-full').style.display = 'none'; document.getElementById('2310.09199v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.06912">arXiv:2308.06912</a> <span> [<a href="https://arxiv.org/pdf/2308.06912">pdf</a>, <a href="https://arxiv.org/format/2308.06912">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CausalLM is not optimal for in-context learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+N">Nan Ding</a>, <a href="/search/cs?searchtype=author&query=Levinboim%2C+T">Tomer Levinboim</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jialin Wu</a>, <a href="/search/cs?searchtype=author&query=Goodman%2C+S">Sebastian Goodman</a>, <a href="/search/cs?searchtype=author&query=Soricut%2C+R">Radu Soricut</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.06912v3-abstract-short" style="display: inline;"> Recent empirical evidence indicates that transformer based in-context learning performs better when using a prefix language model (prefixLM), in which in-context samples can all attend to each other, compared to causal language models (causalLM), which use auto-regressive attention that prohibits in-context samples to attend to future samples. While this result is intuitive, it is not understood f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.06912v3-abstract-full').style.display = 'inline'; document.getElementById('2308.06912v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.06912v3-abstract-full" style="display: none;"> Recent empirical evidence indicates that transformer based in-context learning performs better when using a prefix language model (prefixLM), in which in-context samples can all attend to each other, compared to causal language models (causalLM), which use auto-regressive attention that prohibits in-context samples to attend to future samples. While this result is intuitive, it is not understood from a theoretical perspective. In this paper we take a theoretical approach and analyze the convergence behavior of prefixLM and causalLM under a certain parameter construction. Our analysis shows that both LM types converge to their stationary points at a linear rate, but that while prefixLM converges to the optimal solution of linear regression, causalLM convergence dynamics follows that of an online gradient descent algorithm, which is not guaranteed to be optimal even as the number of samples grows infinitely. We supplement our theoretical claims with empirical experiments over synthetic and real tasks and using various types of transformers. Our experiments verify that causalLM consistently underperforms prefixLM in all settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.06912v3-abstract-full').style.display = 'none'; document.getElementById('2308.06912v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2024 conference paper. Code available at: https://github.com/google-research/causallm_icl</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.18565">arXiv:2305.18565</a> <span> [<a href="https://arxiv.org/pdf/2305.18565">pdf</a>, <a href="https://arxiv.org/format/2305.18565">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> PaLI-X: On Scaling up a Multilingual Vision and Language Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&query=Djolonga%2C+J">Josip Djolonga</a>, <a href="/search/cs?searchtype=author&query=Padlewski%2C+P">Piotr Padlewski</a>, <a href="/search/cs?searchtype=author&query=Mustafa%2C+B">Basil Mustafa</a>, <a href="/search/cs?searchtype=author&query=Changpinyo%2C+S">Soravit Changpinyo</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jialin Wu</a>, <a href="/search/cs?searchtype=author&query=Ruiz%2C+C+R">Carlos Riquelme Ruiz</a>, <a href="/search/cs?searchtype=author&query=Goodman%2C+S">Sebastian Goodman</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiao Wang</a>, <a href="/search/cs?searchtype=author&query=Tay%2C+Y">Yi Tay</a>, <a href="/search/cs?searchtype=author&query=Shakeri%2C+S">Siamak Shakeri</a>, <a href="/search/cs?searchtype=author&query=Dehghani%2C+M">Mostafa Dehghani</a>, <a href="/search/cs?searchtype=author&query=Salz%2C+D">Daniel Salz</a>, <a href="/search/cs?searchtype=author&query=Lucic%2C+M">Mario Lucic</a>, <a href="/search/cs?searchtype=author&query=Tschannen%2C+M">Michael Tschannen</a>, <a href="/search/cs?searchtype=author&query=Nagrani%2C+A">Arsha Nagrani</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hexiang Hu</a>, <a href="/search/cs?searchtype=author&query=Joshi%2C+M">Mandar Joshi</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+B">Bo Pang</a>, <a href="/search/cs?searchtype=author&query=Montgomery%2C+C">Ceslee Montgomery</a>, <a href="/search/cs?searchtype=author&query=Pietrzyk%2C+P">Paulina Pietrzyk</a>, <a href="/search/cs?searchtype=author&query=Ritter%2C+M">Marvin Ritter</a>, <a href="/search/cs?searchtype=author&query=Piergiovanni%2C+A">AJ Piergiovanni</a>, <a href="/search/cs?searchtype=author&query=Minderer%2C+M">Matthias Minderer</a>, <a href="/search/cs?searchtype=author&query=Pavetic%2C+F">Filip Pavetic</a> , et al. (18 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.18565v1-abstract-short" style="display: inline;"> We present the training recipe and results of scaling up PaLI-X, a multilingual vision and language model, both in terms of size of the components and the breadth of its training task mixture. Our model achieves new levels of performance on a wide-range of varied and complex tasks, including multiple image-based captioning and question-answering tasks, image-based document understanding and few-sh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.18565v1-abstract-full').style.display = 'inline'; document.getElementById('2305.18565v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.18565v1-abstract-full" style="display: none;"> We present the training recipe and results of scaling up PaLI-X, a multilingual vision and language model, both in terms of size of the components and the breadth of its training task mixture. Our model achieves new levels of performance on a wide-range of varied and complex tasks, including multiple image-based captioning and question-answering tasks, image-based document understanding and few-shot (in-context) learning, as well as object detection, video question answering, and video captioning. PaLI-X advances the state-of-the-art on most vision-and-language benchmarks considered (25+ of them). Finally, we observe emerging capabilities, such as complex counting and multilingual object detection, tasks that are not explicitly in the training mix. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.18565v1-abstract-full').style.display = 'none'; document.getElementById('2305.18565v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.06794">arXiv:2209.06794</a> <span> [<a href="https://arxiv.org/pdf/2209.06794">pdf</a>, <a href="https://arxiv.org/format/2209.06794">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> PaLI: A Jointly-Scaled Multilingual Language-Image Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiao Wang</a>, <a href="/search/cs?searchtype=author&query=Changpinyo%2C+S">Soravit Changpinyo</a>, <a href="/search/cs?searchtype=author&query=Piergiovanni%2C+A">AJ Piergiovanni</a>, <a href="/search/cs?searchtype=author&query=Padlewski%2C+P">Piotr Padlewski</a>, <a href="/search/cs?searchtype=author&query=Salz%2C+D">Daniel Salz</a>, <a href="/search/cs?searchtype=author&query=Goodman%2C+S">Sebastian Goodman</a>, <a href="/search/cs?searchtype=author&query=Grycner%2C+A">Adam Grycner</a>, <a href="/search/cs?searchtype=author&query=Mustafa%2C+B">Basil Mustafa</a>, <a href="/search/cs?searchtype=author&query=Beyer%2C+L">Lucas Beyer</a>, <a href="/search/cs?searchtype=author&query=Kolesnikov%2C+A">Alexander Kolesnikov</a>, <a href="/search/cs?searchtype=author&query=Puigcerver%2C+J">Joan Puigcerver</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+N">Nan Ding</a>, <a href="/search/cs?searchtype=author&query=Rong%2C+K">Keran Rong</a>, <a href="/search/cs?searchtype=author&query=Akbari%2C+H">Hassan Akbari</a>, <a href="/search/cs?searchtype=author&query=Mishra%2C+G">Gaurav Mishra</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+L">Linting Xue</a>, <a href="/search/cs?searchtype=author&query=Thapliyal%2C+A">Ashish Thapliyal</a>, <a href="/search/cs?searchtype=author&query=Bradbury%2C+J">James Bradbury</a>, <a href="/search/cs?searchtype=author&query=Kuo%2C+W">Weicheng Kuo</a>, <a href="/search/cs?searchtype=author&query=Seyedhosseini%2C+M">Mojtaba Seyedhosseini</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+C">Chao Jia</a>, <a href="/search/cs?searchtype=author&query=Ayan%2C+B+K">Burcu Karagol Ayan</a>, <a href="/search/cs?searchtype=author&query=Riquelme%2C+C">Carlos Riquelme</a>, <a href="/search/cs?searchtype=author&query=Steiner%2C+A">Andreas Steiner</a> , et al. (4 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.06794v4-abstract-short" style="display: inline;"> Effective scaling and a flexible task interface enable large language models to excel at many tasks. We present PaLI (Pathways Language and Image model), a model that extends this approach to the joint modeling of language and vision. PaLI generates text based on visual and textual inputs, and with this interface performs many vision, language, and multimodal tasks, in many languages. To train PaL… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.06794v4-abstract-full').style.display = 'inline'; document.getElementById('2209.06794v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.06794v4-abstract-full" style="display: none;"> Effective scaling and a flexible task interface enable large language models to excel at many tasks. We present PaLI (Pathways Language and Image model), a model that extends this approach to the joint modeling of language and vision. PaLI generates text based on visual and textual inputs, and with this interface performs many vision, language, and multimodal tasks, in many languages. To train PaLI, we make use of large pre-trained encoder-decoder language models and Vision Transformers (ViTs). This allows us to capitalize on their existing capabilities and leverage the substantial cost of training them. We find that joint scaling of the vision and language components is important. Since existing Transformers for language are much larger than their vision counterparts, we train a large, 4-billion parameter ViT (ViT-e) to quantify the benefits from even larger-capacity vision models. To train PaLI, we create a large multilingual mix of pretraining tasks, based on a new image-text training set containing 10B images and texts in over 100 languages. PaLI achieves state-of-the-art in multiple vision and language tasks (such as captioning, visual question-answering, scene-text understanding), while retaining a simple, modular, and scalable design. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.06794v4-abstract-full').style.display = 'none'; document.getElementById('2209.06794v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2023 (Notable-top-5%)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.05534">arXiv:2209.05534</a> <span> [<a href="https://arxiv.org/pdf/2209.05534">pdf</a>, <a href="https://arxiv.org/format/2209.05534">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> PreSTU: Pre-Training for Scene-Text Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kil%2C+J">Jihyung Kil</a>, <a href="/search/cs?searchtype=author&query=Changpinyo%2C+S">Soravit Changpinyo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Hexiang Hu</a>, <a href="/search/cs?searchtype=author&query=Goodman%2C+S">Sebastian Goodman</a>, <a href="/search/cs?searchtype=author&query=Chao%2C+W">Wei-Lun Chao</a>, <a href="/search/cs?searchtype=author&query=Soricut%2C+R">Radu Soricut</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.05534v3-abstract-short" style="display: inline;"> The ability to recognize and reason about text embedded in visual inputs is often lacking in vision-and-language (V&L) models, perhaps because V&L pre-training methods have often failed to include such an ability in their training objective. In this paper, we propose PreSTU, a novel pre-training recipe dedicated to scene-text understanding (STU). PreSTU introduces OCR-aware pre-training objectives… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.05534v3-abstract-full').style.display = 'inline'; document.getElementById('2209.05534v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.05534v3-abstract-full" style="display: none;"> The ability to recognize and reason about text embedded in visual inputs is often lacking in vision-and-language (V&L) models, perhaps because V&L pre-training methods have often failed to include such an ability in their training objective. In this paper, we propose PreSTU, a novel pre-training recipe dedicated to scene-text understanding (STU). PreSTU introduces OCR-aware pre-training objectives that encourage the model to recognize text from an image and connect it to the rest of the image content. We implement PreSTU using a simple transformer-based encoder-decoder architecture, combined with large-scale image-text datasets with scene text obtained from an off-the-shelf OCR system. We empirically demonstrate the effectiveness of this pre-training approach on eight visual question answering and four image captioning benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.05534v3-abstract-full').style.display = 'none'; document.getElementById('2209.05534v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICCV 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.02308">arXiv:2207.02308</a> <span> [<a href="https://arxiv.org/pdf/2207.02308">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3517428.3544819">10.1145/3517428.3544819 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> LaMPost: Design and Evaluation of an AI-assisted Email Writing Prototype for Adults with Dyslexia </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Goodman%2C+S+M">Steven M. Goodman</a>, <a href="/search/cs?searchtype=author&query=Buehler%2C+E">Erin Buehler</a>, <a href="/search/cs?searchtype=author&query=Clary%2C+P">Patrick Clary</a>, <a href="/search/cs?searchtype=author&query=Coenen%2C+A">Andy Coenen</a>, <a href="/search/cs?searchtype=author&query=Donsbach%2C+A">Aaron Donsbach</a>, <a href="/search/cs?searchtype=author&query=Horne%2C+T+N">Tiffanie N. Horne</a>, <a href="/search/cs?searchtype=author&query=Lahav%2C+M">Michal Lahav</a>, <a href="/search/cs?searchtype=author&query=Macdonald%2C+R">Robert Macdonald</a>, <a href="/search/cs?searchtype=author&query=Michaels%2C+R+B">Rain Breaw Michaels</a>, <a href="/search/cs?searchtype=author&query=Narayanan%2C+A">Ajit Narayanan</a>, <a href="/search/cs?searchtype=author&query=Pushkarna%2C+M">Mahima Pushkarna</a>, <a href="/search/cs?searchtype=author&query=Riley%2C+J">Joel Riley</a>, <a href="/search/cs?searchtype=author&query=Santana%2C+A">Alex Santana</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+L">Lei Shi</a>, <a href="/search/cs?searchtype=author&query=Sweeney%2C+R">Rachel Sweeney</a>, <a href="/search/cs?searchtype=author&query=Weaver%2C+P">Phil Weaver</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+A">Ann Yuan</a>, <a href="/search/cs?searchtype=author&query=Morris%2C+M+R">Meredith Ringel Morris</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.02308v1-abstract-short" style="display: inline;"> Prior work has explored the writing challenges experienced by people with dyslexia, and the potential for new spelling, grammar, and word retrieval technologies to address these challenges. However, the capabilities for natural language generation demonstrated by the latest class of large language models (LLMs) highlight an opportunity to explore new forms of human-AI writing support tools. In thi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.02308v1-abstract-full').style.display = 'inline'; document.getElementById('2207.02308v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.02308v1-abstract-full" style="display: none;"> Prior work has explored the writing challenges experienced by people with dyslexia, and the potential for new spelling, grammar, and word retrieval technologies to address these challenges. However, the capabilities for natural language generation demonstrated by the latest class of large language models (LLMs) highlight an opportunity to explore new forms of human-AI writing support tools. In this paper, we introduce LaMPost, a prototype email-writing interface that explores the potential for LLMs to power writing support tools that address the varied needs of people with dyslexia. LaMPost draws from our understanding of these needs and introduces novel AI-powered features for email-writing, including: outlining main ideas, generating a subject line, suggesting changes, rewriting a selection. We evaluated LaMPost with 19 adults with dyslexia, identifying many promising routes for further exploration (including the popularity of the "rewrite" and "subject line" features), but also finding that the current generation of LLMs may not surpass the accuracy and quality thresholds required to meet the needs of writers with dyslexia. Surprisingly, we found that participants' awareness of the AI had no effect on their perception of the system, nor on their feelings of autonomy, expression, and self-efficacy when writing emails. Our findings yield further insight into the benefits and drawbacks of using LLMs as writing support for adults with dyslexia and provide a foundation to build upon in future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.02308v1-abstract-full').style.display = 'none'; document.getElementById('2207.02308v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at The 24th International ACM SIGACCESS Conference on Computers and Accessibility (ASSETS '22), October 23-26, 2022, Athens, Greece. 26 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.17189">arXiv:2203.17189</a> <span> [<a href="https://arxiv.org/pdf/2203.17189">pdf</a>, <a href="https://arxiv.org/format/2203.17189">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Scaling Up Models and Data with $\texttt{t5x}$ and $\texttt{seqio}$ </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Roberts%2C+A">Adam Roberts</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+H+W">Hyung Won Chung</a>, <a href="/search/cs?searchtype=author&query=Levskaya%2C+A">Anselm Levskaya</a>, <a href="/search/cs?searchtype=author&query=Mishra%2C+G">Gaurav Mishra</a>, <a href="/search/cs?searchtype=author&query=Bradbury%2C+J">James Bradbury</a>, <a href="/search/cs?searchtype=author&query=Andor%2C+D">Daniel Andor</a>, <a href="/search/cs?searchtype=author&query=Narang%2C+S">Sharan Narang</a>, <a href="/search/cs?searchtype=author&query=Lester%2C+B">Brian Lester</a>, <a href="/search/cs?searchtype=author&query=Gaffney%2C+C">Colin Gaffney</a>, <a href="/search/cs?searchtype=author&query=Mohiuddin%2C+A">Afroz Mohiuddin</a>, <a href="/search/cs?searchtype=author&query=Hawthorne%2C+C">Curtis Hawthorne</a>, <a href="/search/cs?searchtype=author&query=Lewkowycz%2C+A">Aitor Lewkowycz</a>, <a href="/search/cs?searchtype=author&query=Salcianu%2C+A">Alex Salcianu</a>, <a href="/search/cs?searchtype=author&query=van+Zee%2C+M">Marc van Zee</a>, <a href="/search/cs?searchtype=author&query=Austin%2C+J">Jacob Austin</a>, <a href="/search/cs?searchtype=author&query=Goodman%2C+S">Sebastian Goodman</a>, <a href="/search/cs?searchtype=author&query=Soares%2C+L+B">Livio Baldini Soares</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+H">Haitang Hu</a>, <a href="/search/cs?searchtype=author&query=Tsvyashchenko%2C+S">Sasha Tsvyashchenko</a>, <a href="/search/cs?searchtype=author&query=Chowdhery%2C+A">Aakanksha Chowdhery</a>, <a href="/search/cs?searchtype=author&query=Bastings%2C+J">Jasmijn Bastings</a>, <a href="/search/cs?searchtype=author&query=Bulian%2C+J">Jannis Bulian</a>, <a href="/search/cs?searchtype=author&query=Garcia%2C+X">Xavier Garcia</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+J">Jianmo Ni</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+A">Andrew Chen</a> , et al. (18 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.17189v1-abstract-short" style="display: inline;"> Recent neural network-based language models have benefited greatly from scaling up the size of training datasets and the number of parameters in the models themselves. Scaling can be complicated due to various factors including the need to distribute computation on supercomputer clusters (e.g., TPUs), prevent bottlenecks when infeeding data, and ensure reproducible results. In this work, we presen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.17189v1-abstract-full').style.display = 'inline'; document.getElementById('2203.17189v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.17189v1-abstract-full" style="display: none;"> Recent neural network-based language models have benefited greatly from scaling up the size of training datasets and the number of parameters in the models themselves. Scaling can be complicated due to various factors including the need to distribute computation on supercomputer clusters (e.g., TPUs), prevent bottlenecks when infeeding data, and ensure reproducible results. In this work, we present two software libraries that ease these issues: $\texttt{t5x}$ simplifies the process of building and training large language models at scale while maintaining ease of use, and $\texttt{seqio}$ provides a task-based API for simple creation of fast and reproducible training data and evaluation pipelines. These open-source libraries have been used to train models with hundreds of billions of parameters on datasets with multiple terabytes of training data. Along with the libraries, we release configurations and instructions for T5-like encoder-decoder models as well as GPT-like decoder-only architectures. $\texttt{t5x}$ and $\texttt{seqio}$ are open source and available at https://github.com/google-research/t5x and https://github.com/google/seqio, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.17189v1-abstract-full').style.display = 'none'; document.getElementById('2203.17189v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2202.11134">arXiv:2202.11134</a> <span> [<a href="https://arxiv.org/pdf/2202.11134">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3491102.3502020">10.1145/3491102.3502020 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> ProtoSound: A Personalized and Scalable Sound Recognition System for Deaf and Hard-of-Hearing Users </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jain%2C+D">Dhruv Jain</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+K+H+A">Khoa Huynh Anh Nguyen</a>, <a href="/search/cs?searchtype=author&query=Goodman%2C+S">Steven Goodman</a>, <a href="/search/cs?searchtype=author&query=Grossman-Kahn%2C+R">Rachel Grossman-Kahn</a>, <a href="/search/cs?searchtype=author&query=Ngo%2C+H">Hung Ngo</a>, <a href="/search/cs?searchtype=author&query=Kusupati%2C+A">Aditya Kusupati</a>, <a href="/search/cs?searchtype=author&query=Du%2C+R">Ruofei Du</a>, <a href="/search/cs?searchtype=author&query=Olwal%2C+A">Alex Olwal</a>, <a href="/search/cs?searchtype=author&query=Findlater%2C+L">Leah Findlater</a>, <a href="/search/cs?searchtype=author&query=Froehlich%2C+J+E">Jon E. Froehlich</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2202.11134v1-abstract-short" style="display: inline;"> Recent advances have enabled automatic sound recognition systems for deaf and hard of hearing (DHH) users on mobile devices. However, these tools use pre-trained, generic sound recognition models, which do not meet the diverse needs of DHH users. We introduce ProtoSound, an interactive system for customizing sound recognition models by recording a few examples, thereby enabling personalized and fi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.11134v1-abstract-full').style.display = 'inline'; document.getElementById('2202.11134v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2202.11134v1-abstract-full" style="display: none;"> Recent advances have enabled automatic sound recognition systems for deaf and hard of hearing (DHH) users on mobile devices. However, these tools use pre-trained, generic sound recognition models, which do not meet the diverse needs of DHH users. We introduce ProtoSound, an interactive system for customizing sound recognition models by recording a few examples, thereby enabling personalized and fine-grained categories. ProtoSound is motivated by prior work examining sound awareness needs of DHH people and by a survey we conducted with 472 DHH participants. To evaluate ProtoSound, we characterized performance on two real-world sound datasets, showing significant improvement over state-of-the-art (e.g., +9.7% accuracy on the first dataset). We then deployed ProtoSound's end-user training and real-time recognition through a mobile application and recruited 19 hearing participants who listened to the real-world sounds and rated the accuracy across 56 locations (e.g., homes, restaurants, parks). Results show that ProtoSound personalized the model on-device in real-time and accurately learned sounds across diverse acoustic contexts. We close by discussing open challenges in personalizable sound recognition, including the need for better recording interfaces and algorithmic improvements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2202.11134v1-abstract-full').style.display = 'none'; document.getElementById('2202.11134v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 February, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at the ACM CHI Conference on Human Factors in Computing Systems (CHI) 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2109.10412">arXiv:2109.10412</a> <span> [<a href="https://arxiv.org/pdf/2109.10412">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3479578">10.1145/3479578 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Social, Environmental, and Technical: Factors at Play in the Current Use and Future Design of Small-Group Captioning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=McDonnell%2C+E+J">Emma J. McDonnell</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+P">Ping Liu</a>, <a href="/search/cs?searchtype=author&query=Goodman%2C+S+M">Steven M. Goodman</a>, <a href="/search/cs?searchtype=author&query=Kushalnagar%2C+R">Raja Kushalnagar</a>, <a href="/search/cs?searchtype=author&query=Froehlich%2C+J+E">Jon E. Froehlich</a>, <a href="/search/cs?searchtype=author&query=Findlater%2C+L">Leah Findlater</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2109.10412v1-abstract-short" style="display: inline;"> Real-time captioning is a critical accessibility tool for many d/Deaf and hard of hearing (DHH) people. While the vast majority of captioning work has focused on formal settings and technical innovations, in contrast, we investigate captioning for informal, interactive small-group conversations, which have a high degree of spontaneity and foster dynamic social interactions. This paper reports on s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.10412v1-abstract-full').style.display = 'inline'; document.getElementById('2109.10412v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2109.10412v1-abstract-full" style="display: none;"> Real-time captioning is a critical accessibility tool for many d/Deaf and hard of hearing (DHH) people. While the vast majority of captioning work has focused on formal settings and technical innovations, in contrast, we investigate captioning for informal, interactive small-group conversations, which have a high degree of spontaneity and foster dynamic social interactions. This paper reports on semi-structured interviews and design probe activities we conducted with 15 DHH participants to understand their use of existing real-time captioning services and future design preferences for both in-person and remote small-group communication. We found that our participants' experiences of captioned small-group conversations are shaped by social, environmental, and technical considerations (e.g., interlocutors' pre-established relationships, the type of captioning displays available, and how far captions lag behind speech). When considering future captioning tools, participants were interested in greater feedback on non-speech elements of conversation (e.g., speaker identity, speech rate, volume) both for their personal use and to guide hearing interlocutors toward more accessible communication. We contribute a qualitative account of DHH people's real-time captioning experiences during small-group conversation and future design considerations to better support the groups being captioned, both in person and online. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2109.10412v1-abstract-full').style.display = 'none'; document.getElementById('2109.10412v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">25 pages, 3 figures, to be published in the PACMHCI-CSCW2 October 2021 edition, to be presented at CSCW 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.06899">arXiv:2106.06899</a> <span> [<a href="https://arxiv.org/pdf/2106.06899">pdf</a>, <a href="https://arxiv.org/format/2106.06899">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Memory-efficient Transformers via Top-$k$ Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gupta%2C+A">Ankit Gupta</a>, <a href="/search/cs?searchtype=author&query=Dar%2C+G">Guy Dar</a>, <a href="/search/cs?searchtype=author&query=Goodman%2C+S">Shaya Goodman</a>, <a href="/search/cs?searchtype=author&query=Ciprut%2C+D">David Ciprut</a>, <a href="/search/cs?searchtype=author&query=Berant%2C+J">Jonathan Berant</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.06899v1-abstract-short" style="display: inline;"> Following the success of dot-product attention in Transformers, numerous approximations have been recently proposed to address its quadratic complexity with respect to the input length. While these variants are memory and compute efficient, it is not possible to directly use them with popular pre-trained language models trained using vanilla attention, without an expensive corrective pre-training… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.06899v1-abstract-full').style.display = 'inline'; document.getElementById('2106.06899v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.06899v1-abstract-full" style="display: none;"> Following the success of dot-product attention in Transformers, numerous approximations have been recently proposed to address its quadratic complexity with respect to the input length. While these variants are memory and compute efficient, it is not possible to directly use them with popular pre-trained language models trained using vanilla attention, without an expensive corrective pre-training stage. In this work, we propose a simple yet highly accurate approximation for vanilla attention. We process the queries in chunks, and for each query, compute the top-$k$ scores with respect to the keys. Our approach offers several advantages: (a) its memory usage is linear in the input size, similar to linear attention variants, such as Performer and RFA (b) it is a drop-in replacement for vanilla attention that does not require any corrective pre-training, and (c) it can also lead to significant memory savings in the feed-forward layers after casting them into the familiar query-key-value framework. We evaluate the quality of top-$k$ approximation for multi-head attention layers on the Long Range Arena Benchmark, and for feed-forward layers of T5 and UnifiedQA on multiple QA datasets. We show our approach leads to accuracy that is nearly-identical to vanilla attention in multiple setups including training from scratch, fine-tuning, and zero-shot inference. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.06899v1-abstract-full').style.display = 'none'; document.getElementById('2106.06899v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2105.14099">arXiv:2105.14099</a> <span> [<a href="https://arxiv.org/pdf/2105.14099">pdf</a>, <a href="https://arxiv.org/format/2105.14099">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Bridging the Gap Between Practice and PAC-Bayes Theory in Few-Shot Meta-Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+N">Nan Ding</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&query=Levinboim%2C+T">Tomer Levinboim</a>, <a href="/search/cs?searchtype=author&query=Goodman%2C+S">Sebastian Goodman</a>, <a href="/search/cs?searchtype=author&query=Soricut%2C+R">Radu Soricut</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2105.14099v2-abstract-short" style="display: inline;"> Despite recent advances in its theoretical understanding, there still remains a significant gap in the ability of existing PAC-Bayesian theories on meta-learning to explain performance improvements in the few-shot learning setting, where the number of training examples in the target tasks is severely limited. This gap originates from an assumption in the existing theories which supposes that the n… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2105.14099v2-abstract-full').style.display = 'inline'; document.getElementById('2105.14099v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2105.14099v2-abstract-full" style="display: none;"> Despite recent advances in its theoretical understanding, there still remains a significant gap in the ability of existing PAC-Bayesian theories on meta-learning to explain performance improvements in the few-shot learning setting, where the number of training examples in the target tasks is severely limited. This gap originates from an assumption in the existing theories which supposes that the number of training examples in the observed tasks and the number of training examples in the target tasks follow the same distribution, an assumption that rarely holds in practice. By relaxing this assumption, we develop two PAC-Bayesian bounds tailored for the few-shot learning setting and show that two existing meta-learning algorithms (MAML and Reptile) can be derived from our bounds, thereby bridging the gap between practice and PAC-Bayesian theories. Furthermore, we derive a new computationally-efficient PACMAML algorithm, and show it outperforms existing meta-learning algorithms on several few-shot benchmark datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2105.14099v2-abstract-full').style.display = 'none'; document.getElementById('2105.14099v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 May, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Neural Information Processing Systems 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2010.03494">arXiv:2010.03494</a> <span> [<a href="https://arxiv.org/pdf/2010.03494">pdf</a>, <a href="https://arxiv.org/format/2010.03494">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> TeaForN: Teacher-Forcing with N-grams </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Goodman%2C+S">Sebastian Goodman</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+N">Nan Ding</a>, <a href="/search/cs?searchtype=author&query=Soricut%2C+R">Radu Soricut</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2010.03494v2-abstract-short" style="display: inline;"> Sequence generation models trained with teacher-forcing suffer from issues related to exposure bias and lack of differentiability across timesteps. Our proposed method, Teacher-Forcing with N-grams (TeaForN), addresses both these problems directly, through the use of a stack of N decoders trained to decode along a secondary time axis that allows model parameter updates based on N prediction steps.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.03494v2-abstract-full').style.display = 'inline'; document.getElementById('2010.03494v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2010.03494v2-abstract-full" style="display: none;"> Sequence generation models trained with teacher-forcing suffer from issues related to exposure bias and lack of differentiability across timesteps. Our proposed method, Teacher-Forcing with N-grams (TeaForN), addresses both these problems directly, through the use of a stack of N decoders trained to decode along a secondary time axis that allows model parameter updates based on N prediction steps. TeaForN can be used with a wide class of decoder architectures and requires minimal modifications from a standard teacher-forcing setup. Empirically, we show that TeaForN boosts generation quality on one Machine Translation benchmark, WMT 2014 English-French, and two News Summarization benchmarks, CNN/Dailymail and Gigaword. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2010.03494v2-abstract-full').style.display = 'none'; document.getElementById('2010.03494v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 October, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">to be published in EMNLP 2020</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2006.08686">arXiv:2006.08686</a> <span> [<a href="https://arxiv.org/pdf/2006.08686">pdf</a>, <a href="https://arxiv.org/format/2006.08686">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Multi-Image Summarization: Textual Summary from a Set of Cohesive Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Trieu%2C+N">Nicholas Trieu</a>, <a href="/search/cs?searchtype=author&query=Goodman%2C+S">Sebastian Goodman</a>, <a href="/search/cs?searchtype=author&query=Narayana%2C+P">Pradyumna Narayana</a>, <a href="/search/cs?searchtype=author&query=Sone%2C+K">Kazoo Sone</a>, <a href="/search/cs?searchtype=author&query=Soricut%2C+R">Radu Soricut</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2006.08686v1-abstract-short" style="display: inline;"> Multi-sentence summarization is a well studied problem in NLP, while generating image descriptions for a single image is a well studied problem in Computer Vision. However, for applications such as image cluster labeling or web page summarization, summarizing a set of images is also a useful and challenging task. This paper proposes the new task of multi-image summarization, which aims to generate… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.08686v1-abstract-full').style.display = 'inline'; document.getElementById('2006.08686v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2006.08686v1-abstract-full" style="display: none;"> Multi-sentence summarization is a well studied problem in NLP, while generating image descriptions for a single image is a well studied problem in Computer Vision. However, for applications such as image cluster labeling or web page summarization, summarizing a set of images is also a useful and challenging task. This paper proposes the new task of multi-image summarization, which aims to generate a concise and descriptive textual summary given a coherent set of input images. We propose a model that extends the image-captioning Transformer-based architecture for single image to multi-image. A dense average image feature aggregation network allows the model to focus on a coherent subset of attributes across the input images. We explore various input representations to the Transformer network and empirically show that aggregated image features are superior to individual image embeddings. We additionally show that the performance of the model is further improved by pretraining the model parameters on a single-image captioning task, which appears to be particularly effective in eliminating hallucinations in the output. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2006.08686v1-abstract-full').style.display = 'none'; document.getElementById('2006.08686v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 June, 2020; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2020. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1909.11942">arXiv:1909.11942</a> <span> [<a href="https://arxiv.org/pdf/1909.11942">pdf</a>, <a href="https://arxiv.org/format/1909.11942">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ALBERT: A Lite BERT for Self-supervised Learning of Language Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lan%2C+Z">Zhenzhong Lan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mingda Chen</a>, <a href="/search/cs?searchtype=author&query=Goodman%2C+S">Sebastian Goodman</a>, <a href="/search/cs?searchtype=author&query=Gimpel%2C+K">Kevin Gimpel</a>, <a href="/search/cs?searchtype=author&query=Sharma%2C+P">Piyush Sharma</a>, <a href="/search/cs?searchtype=author&query=Soricut%2C+R">Radu Soricut</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1909.11942v6-abstract-short" style="display: inline;"> Increasing model size when pretraining natural language representations often results in improved performance on downstream tasks. However, at some point further model increases become harder due to GPU/TPU memory limitations and longer training times. To address these problems, we present two parameter-reduction techniques to lower memory consumption and increase the training speed of BERT. Compr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1909.11942v6-abstract-full').style.display = 'inline'; document.getElementById('1909.11942v6-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1909.11942v6-abstract-full" style="display: none;"> Increasing model size when pretraining natural language representations often results in improved performance on downstream tasks. However, at some point further model increases become harder due to GPU/TPU memory limitations and longer training times. To address these problems, we present two parameter-reduction techniques to lower memory consumption and increase the training speed of BERT. Comprehensive empirical evidence shows that our proposed methods lead to models that scale much better compared to the original BERT. We also use a self-supervised loss that focuses on modeling inter-sentence coherence, and show it consistently helps downstream tasks with multi-sentence inputs. As a result, our best model establishes new state-of-the-art results on the GLUE, RACE, and \squad benchmarks while having fewer parameters compared to BERT-large. The code and the pretrained models are available at https://github.com/google-research/ALBERT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1909.11942v6-abstract-full').style.display = 'none'; document.getElementById('1909.11942v6-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2020; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 September, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1909.10599">arXiv:1909.10599</a> <span> [<a href="https://arxiv.org/pdf/1909.10599">pdf</a>, <a href="https://arxiv.org/ps/1909.10599">ps</a>, <a href="https://arxiv.org/format/1909.10599">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Multi-stage Pretraining for Abstractive Summarization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Goodman%2C+S">Sebastian Goodman</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+Z">Zhenzhong Lan</a>, <a href="/search/cs?searchtype=author&query=Soricut%2C+R">Radu Soricut</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1909.10599v1-abstract-short" style="display: inline;"> Neural models for abstractive summarization tend to achieve the best performance in the presence of highly specialized, summarization specific modeling add-ons such as pointer-generator, coverage-modeling, and inferencetime heuristics. We show here that pretraining can complement such modeling advancements to yield improved results in both short-form and long-form abstractive summarization using t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1909.10599v1-abstract-full').style.display = 'inline'; document.getElementById('1909.10599v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1909.10599v1-abstract-full" style="display: none;"> Neural models for abstractive summarization tend to achieve the best performance in the presence of highly specialized, summarization specific modeling add-ons such as pointer-generator, coverage-modeling, and inferencetime heuristics. We show here that pretraining can complement such modeling advancements to yield improved results in both short-form and long-form abstractive summarization using two key concepts: full-network initialization and multi-stage pretraining. Our method allows the model to transitively benefit from multiple pretraining tasks, from generic language tasks to a specialized summarization task to an even more specialized one such as bullet-based summarization. Using this approach, we demonstrate improvements of 1.05 ROUGE-L points on the Gigaword benchmark and 1.78 ROUGE-L points on the CNN/DailyMail benchmark, compared to a randomly-initialized baseline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1909.10599v1-abstract-full').style.display = 'none'; document.getElementById('1909.10599v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2019. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1908.07333">arXiv:1908.07333</a> <span> [<a href="https://arxiv.org/pdf/1908.07333">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Fairness Issues in AI Systems that Augment Sensory Abilities </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Findlater%2C+L">Leah Findlater</a>, <a href="/search/cs?searchtype=author&query=Goodman%2C+S">Steven Goodman</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yuhang Zhao</a>, <a href="/search/cs?searchtype=author&query=Azenkot%2C+S">Shiri Azenkot</a>, <a href="/search/cs?searchtype=author&query=Hanley%2C+M">Margot Hanley</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1908.07333v1-abstract-short" style="display: inline;"> Systems that augment sensory abilities are increasingly employing AI and machine learning (ML) approaches, with applications ranging from object recognition and scene description tools for blind users to sound awareness tools for d/Deaf users. However, unlike many other AI-enabled technologies, these systems provide information that is already available to non-disabled people. In this paper, we di… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1908.07333v1-abstract-full').style.display = 'inline'; document.getElementById('1908.07333v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1908.07333v1-abstract-full" style="display: none;"> Systems that augment sensory abilities are increasingly employing AI and machine learning (ML) approaches, with applications ranging from object recognition and scene description tools for blind users to sound awareness tools for d/Deaf users. However, unlike many other AI-enabled technologies, these systems provide information that is already available to non-disabled people. In this paper, we discuss unique AI fairness challenges that arise in this context, including accessibility issues with data and models, ethical implications in deciding what sensory information to convey to the user, and privacy concerns both for the primary user and for others. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1908.07333v1-abstract-full').style.display = 'none'; document.getElementById('1908.07333v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 August, 2019; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2019. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">4 pages. Accepted to the ACM ASSETS 2019 Workshop on AI Fairness for People with Disabilities</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1612.07833">arXiv:1612.07833</a> <span> [<a href="https://arxiv.org/pdf/1612.07833">pdf</a>, <a href="https://arxiv.org/format/1612.07833">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Understanding Image and Text Simultaneously: a Dual Vision-Language Machine Comprehension Task </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ding%2C+N">Nan Ding</a>, <a href="/search/cs?searchtype=author&query=Goodman%2C+S">Sebastian Goodman</a>, <a href="/search/cs?searchtype=author&query=Sha%2C+F">Fei Sha</a>, <a href="/search/cs?searchtype=author&query=Soricut%2C+R">Radu Soricut</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1612.07833v1-abstract-short" style="display: inline;"> We introduce a new multi-modal task for computer systems, posed as a combined vision-language comprehension challenge: identifying the most suitable text describing a scene, given several similar options. Accomplishing the task entails demonstrating comprehension beyond just recognizing "keywords" (or key-phrases) and their corresponding visual concepts. Instead, it requires an alignment between t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1612.07833v1-abstract-full').style.display = 'inline'; document.getElementById('1612.07833v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1612.07833v1-abstract-full" style="display: none;"> We introduce a new multi-modal task for computer systems, posed as a combined vision-language comprehension challenge: identifying the most suitable text describing a scene, given several similar options. Accomplishing the task entails demonstrating comprehension beyond just recognizing "keywords" (or key-phrases) and their corresponding visual concepts. Instead, it requires an alignment between the representations of the two modalities that achieves a visually-grounded "understanding" of various linguistic elements and their dependencies. This new task also admits an easy-to-compute and well-studied metric: the accuracy in detecting the true target among the decoys. The paper makes several contributions: an effective and extensible mechanism for generating decoys from (human-created) image captions; an instance of applying this mechanism, yielding a large-scale machine comprehension dataset (based on the COCO images and captions) that we make publicly available; human evaluation results on this dataset, informing a performance upper-bound; and several baseline and competitive learning approaches that illustrate the utility of the proposed task and dataset in advancing both image and language comprehension. We also show that, in a multi-task learning setting, the performance on the proposed task is positively correlated with the end-to-end task of image captioning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1612.07833v1-abstract-full').style.display = 'none'; document.getElementById('1612.07833v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 December, 2016; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2016. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/1105.6020">arXiv:1105.6020</a> <span> [<a href="https://arxiv.org/pdf/1105.6020">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Level of Presence in Team-Building Activities: Gaming Component in Virtual Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=De+Leo%2C+G">Gianluca De Leo</a>, <a href="/search/cs?searchtype=author&query=Goodman%2C+K+S">Koren S. Goodman</a>, <a href="/search/cs?searchtype=author&query=Radici%2C+E">Elena Radici</a>, <a href="/search/cs?searchtype=author&query=Secrhist%2C+S+R">Scott R. Secrhist</a>, <a href="/search/cs?searchtype=author&query=Mastaglio%2C+T+W">Thomas W. Mastaglio</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="1105.6020v1-abstract-short" style="display: inline;"> Historically the training of teams has been implemented using a face-to-face approach. In the past decade, on-line multiuser virtual environments have offered a solution for training teams whose members are geographically dispersed. In order to develop on effective team training activity, a high sense of presence among the participant needs to be reached. Previous research studies reported being a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1105.6020v1-abstract-full').style.display = 'inline'; document.getElementById('1105.6020v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="1105.6020v1-abstract-full" style="display: none;"> Historically the training of teams has been implemented using a face-to-face approach. In the past decade, on-line multiuser virtual environments have offered a solution for training teams whose members are geographically dispersed. In order to develop on effective team training activity, a high sense of presence among the participant needs to be reached. Previous research studies reported being able to reach a high level of presence even when using inexpensive technology such as laptop and headset. This study evaluates the level of presence of ten subjects who have to perform a team-building activity in a multi-user virtual environment using a laptop computer and a headset. The authors are interested in determining which user characterizes, such as gender, age and knowledge of computers, have a strong correlation with the level of sense of presence. The results of this study showed that female participants were more likely to engage in the activity and perceived fewer negative effects. Participants who reported less negative effects such as feeling tired, dizzy, or experiencing eye strain during the team-building activity reached a higher level of sense of presence. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('1105.6020v1-abstract-full').style.display = 'none'; document.getElementById('1105.6020v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 May, 2011; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2011. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 1 figure, 5 tables; The International Journal of Multimedia & Its Applications (IJMA) Vol.3, No.2, May 2011</span> </p> </li> </ol> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository