
Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;32 of 32 results for author: <span class="mathjax">Bitton, Y</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Bitton%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Bitton, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Bitton%2C+Y&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Bitton, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.03555">arXiv:2412.03555</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.03555">pdf</a>, <a href="https://arxiv.org/format/2412.03555">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> PaliGemma 2: A Family of Versatile VLMs for Transfer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Steiner%2C+A">Andreas Steiner</a>, <a href="/search/cs?searchtype=author&amp;query=Pinto%2C+A+S">Andr茅 Susano Pinto</a>, <a href="/search/cs?searchtype=author&amp;query=Tschannen%2C+M">Michael Tschannen</a>, <a href="/search/cs?searchtype=author&amp;query=Keysers%2C+D">Daniel Keysers</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Bitton%2C+Y">Yonatan Bitton</a>, <a href="/search/cs?searchtype=author&amp;query=Gritsenko%2C+A">Alexey Gritsenko</a>, <a href="/search/cs?searchtype=author&amp;query=Minderer%2C+M">Matthias Minderer</a>, <a href="/search/cs?searchtype=author&amp;query=Sherbondy%2C+A">Anthony Sherbondy</a>, <a href="/search/cs?searchtype=author&amp;query=Long%2C+S">Shangbang Long</a>, <a href="/search/cs?searchtype=author&amp;query=Qin%2C+S">Siyang Qin</a>, <a href="/search/cs?searchtype=author&amp;query=Ingle%2C+R">Reeve Ingle</a>, <a href="/search/cs?searchtype=author&amp;query=Bugliarello%2C+E">Emanuele Bugliarello</a>, <a href="/search/cs?searchtype=author&amp;query=Kazemzadeh%2C+S">Sahar Kazemzadeh</a>, <a href="/search/cs?searchtype=author&amp;query=Mesnard%2C+T">Thomas Mesnard</a>, <a href="/search/cs?searchtype=author&amp;query=Alabdulmohsin%2C+I">Ibrahim Alabdulmohsin</a>, <a href="/search/cs?searchtype=author&amp;query=Beyer%2C+L">Lucas Beyer</a>, <a href="/search/cs?searchtype=author&amp;query=Zhai%2C+X">Xiaohua Zhai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.03555v1-abstract-short" style="display: inline;"> PaliGemma 2 is an upgrade of the PaliGemma open Vision-Language Model (VLM) based on the Gemma 2 family of language models. We combine the SigLIP-So400m vision encoder that was also used by PaliGemma with the whole range of Gemma 2 models, from the 2B one all the way up to the 27B model. 
Abstract: PaliGemma 2 is an upgrade of the PaliGemma open Vision-Language Model (VLM) based on the Gemma 2 family of language models. We combine the SigLIP-So400m vision encoder that was also used by PaliGemma with the whole range of Gemma 2 models, from the 2B one all the way up to the 27B model. We train these models at three resolutions (224px, 448px, and 896px) in multiple stages to equip them with broad knowledge for transfer via fine-tuning. The resulting family of base models covering different model sizes and resolutions allows us to investigate factors impacting transfer performance (such as learning rate) and to analyze the interplay between the type of task, model size, and resolution. We further increase the number and breadth of transfer tasks beyond the scope of PaliGemma including different OCR-related tasks such as table structure recognition, molecular structure recognition, music score recognition, as well as long fine-grained captioning and radiography report generation, on which PaliGemma 2 obtains state-of-the-art results.
Submitted 4 December, 2024; originally announced December 2024.

2. arXiv:2411.09018 [pdf, other]  cs.CV; cs.CL; cs.LG
Bridging the Visual Gap: Fine-Tuning Multimodal Models with Knowledge-Adapted Captions
Authors: Moran Yanuka, Assaf Ben Kish, Yonatan Bitton, Idan Szpektor, Raja Giryes
Abstract: Recent research increasingly focuses on training vision-language models (VLMs) with long, detailed image captions. However, small-scale VLMs often struggle to balance the richness of these captions with the risk of hallucinating content during fine-tuning. In this paper, we explore how well VLMs adapt to such captions. To quantify caption quality, we propose Decomposed NLI (DNLI), an evaluation framework that breaks down generated captions into individual propositions, assessing each in isolation. This fine-grained analysis reveals a critical balance between capturing descriptive details and preventing hallucinations. Our findings show that simply reducing caption complexity or employing standard data curation techniques does not effectively resolve this issue. To tackle this challenge, we introduce Knowledge Adapted (KnowAda) fine-tuning, a data-centric approach that automatically adapts training data with the model's existing knowledge and visual understanding. KnowAda minimizes hallucinations while preserving high descriptiveness. We validate this approach across several small-scale VLMs (up to 7B parameters) and dense caption datasets, demonstrating that KnowAda effectively balances hallucination reduction and descriptiveness.
Our results show that KnowAda outperforms various baselines in both automatic metrics and human evaluations. We will release our code and models.
Submitted 7 February, 2025; v1 submitted 13 November, 2024; originally announced November 2024.
Comments: Accepted to NAACL 2025

3. arXiv:2410.11824 [pdf, other]  cs.CV
KITTEN: A Knowledge-Intensive Evaluation of Image Generation on Visual Entities
Authors: Hsin-Ping Huang, Xinyi Wang, Yonatan Bitton, Hagai Taitelbaum, Gaurav Singh Tomar, Ming-Wei Chang, Xuhui Jia, Kelvin C. K. Chan, Hexiang Hu, Yu-Chuan Su, Ming-Hsuan Yang
Abstract: Recent advancements in text-to-image generation have significantly enhanced the quality of synthesized images.
Despite this progress, evaluations predominantly focus on aesthetic appeal or alignment with text prompts. Consequently, there is limited understanding of whether these models can accurately represent a wide variety of realistic visual entities - a task requiring real-world knowledge. To address this gap, we propose a benchmark focused on evaluating Knowledge-InTensive image generaTion on real-world ENtities (i.e., KITTEN). Using KITTEN, we conduct a systematic study on the fidelity of entities in text-to-image generation models, focusing on their ability to generate a wide range of real-world visual entities, such as landmark buildings, aircraft, plants, and animals. We evaluate the latest text-to-image models and retrieval-augmented customization models using both automatic metrics and carefully-designed human evaluations, with an emphasis on the fidelity of entities in the generated images. Our findings reveal that even the most advanced text-to-image models often fail to generate entities with accurate visual details. Although retrieval-augmented models can enhance the fidelity of entity by incorporating reference images during testing, they often over-rely on these references and struggle to produce novel configurations of the entity as requested in creative text prompts.
Submitted 15 October, 2024; originally announced October 2024.
Comments: Project page: https://kitten-project.github.io/

4. arXiv:2410.02613 [pdf, other]  cs.CV; cs.AI; cs.CL
NL-Eye: Abductive NLI for Images
Authors: Mor Ventura, Michael Toker, Nitay Calderon, Zorik Gekhman, Yonatan Bitton, Roi Reichart
Abstract: Will a Visual Language Model (VLM)-based bot warn us about slipping if it detects a wet floor? Recent VLMs have demonstrated impressive capabilities, yet their ability to infer outcomes and causes remains underexplored. To address this, we introduce NL-Eye, a benchmark designed to assess VLMs' visual abductive reasoning skills. NL-Eye adapts the abductive Natural Language Inference (NLI) task to the visual domain, requiring models to evaluate the plausibility of hypothesis images based on a premise image and explain their decisions. NL-Eye consists of 350 carefully curated triplet examples (1,050 images) spanning diverse reasoning categories: physical, functional, logical, emotional, cultural, and social. The data curation process involved two steps - writing textual descriptions and generating images using text-to-image models, both requiring substantial human involvement to ensure high-quality and challenging scenes. Our experiments show that VLMs struggle significantly on NL-Eye, often performing at random baseline levels, while humans excel in both plausibility prediction and explanation quality. This demonstrates a deficiency in the abductive reasoning capabilities of modern VLMs. NL-Eye represents a crucial step toward developing VLMs capable of robust multimodal reasoning for real-world applications, including accident-prevention bots and generated video verification.
Submitted 3 October, 2024; originally announced October 2024.

5. arXiv:2407.19474 [pdf, other]  cs.CV; cs.CL
Visual Riddles: a Commonsense and World Knowledge Challenge for Large Vision and Language Models
Authors: Nitzan Bitton-Guetta, Aviv Slobodkin, Aviya Maimon, Eliya Habba, Royi Rassin, Yonatan Bitton, Idan Szpektor, Amir Globerson, Yuval Elovici
Abstract: Imagine observing someone scratching their arm; to understand why, additional context would be necessary. However, spotting a mosquito nearby would immediately offer a likely explanation for the person's discomfort, thereby alleviating the need for further information. This example illustrates how subtle visual cues can challenge our cognitive skills and demonstrates the complexity of interpreting visual scenarios. To study these skills, we present Visual Riddles, a benchmark aimed to test vision and language models on visual riddles requiring commonsense and world knowledge. The benchmark comprises 400 visual riddles, each featuring a unique image created by a variety of text-to-image models, question, ground-truth answer, textual hint, and attribution. Human evaluation reveals that existing models lag significantly behind human performance, which is at 82% accuracy, with Gemini-Pro-1.5 leading with 40% accuracy. Our benchmark comes with automatic evaluation tasks to make assessment scalable.
These findings underscore the potential of Visual Riddles as a valuable resource for enhancing vision and language models' capabilities in interpreting complex visual scenarios.
Submitted 25 November, 2024; v1 submitted 28 July, 2024; originally announced July 2024.
Comments: https://visual-riddles.github.io/

6. arXiv:2407.11814 [pdf, other]  cs.CV
Contrastive Sequential-Diffusion Learning: Non-linear and Multi-Scene Instructional Video Synthesis
Authors: Vasco Ramos, Yonatan Bitton, Michal Yarom, Idan Szpektor, Joao Magalhaes
Abstract: Generated video scenes for action-centric sequence descriptions, such as recipe instructions and do-it-yourself projects, often include non-linear patterns, where the next video may need to be visually consistent not with the immediately preceding video but with earlier ones. Current multi-scene video synthesis approaches fail to meet these consistency requirements.
To address this, we propose a contrastive sequential video diffusion method that selects the most suitable previously generated scene to guide and condition the denoising process of the next scene. The result is a multi-scene video that is grounded in the scene descriptions and coherent w.r.t. the scenes that require visual consistency. Experiments with action-centered data from the real world demonstrate the practicality and improved consistency of our model compared to previous work.
Submitted 6 December, 2024; v1 submitted 16 July, 2024; originally announced July 2024.

7. arXiv:2407.06189 [pdf, other]  cs.CV; cs.AI
Video-STaR: Self-Training Enables Video Instruction Tuning with Any Supervision
Authors: Orr Zohar, Xiaohan Wang, Yonatan Bitton, Idan Szpektor, Serena Yeung-Levy
Abstract: The performance of Large Vision Language Models (LVLMs) is dependent on the size and quality of their training datasets. Existing video instruction tuning datasets lack diversity as they are derived by prompting large language models with video captions to generate question-answer pairs, and are therefore mostly descriptive.
Meanwhile, many labeled video datasets with diverse labels and supervision exist - however, we find that their integration into LVLMs is non-trivial. Herein, we present Video Self-Training with augmented Reasoning (Video-STaR), the first video self-training approach. Video-STaR allows the utilization of any labeled video dataset for video instruction tuning. In Video-STaR, an LVLM cycles between instruction generation and finetuning, which we show (I) improves general video understanding and (II) adapts LVLMs to novel downstream tasks with existing supervision. During generation, an LVLM is prompted to propose an answer. The answers are then filtered only to those that contain the original video labels, and the LVLM is then re-trained on the generated dataset. By only training on generated answers that contain the correct video labels, Video-STaR utilizes these existing video labels as weak supervision for video instruction tuning. Our results demonstrate that Video-STaR-enhanced LVLMs exhibit improved performance in (I) general video QA, where TempCompass performance improved by 10%, and (II) on downstream tasks, where Video-STaR improved Kinetics700-QA accuracy by 20% and action quality assessment on FineDiving by 15%.
Submitted 8 July, 2024; originally announced July 2024.
Comments: Project page: https://orrzohar.github.io/projects/video-star/

8. arXiv:2406.16807 [pdf, other]  cs.LG; cs.CL; cs.CV
Beyond Thumbs Up/Down: Untangling Challenges of Fine-Grained Feedback for Text-to-Image Generation
Authors: Katherine M. Collins, Najoung Kim, Yonatan Bitton, Verena Rieser, Shayegan Omidshafiei, Yushi Hu, Sherol Chen, Senjuti Dutta, Minsuk Chang, Kimin Lee, Youwei Liang, Georgina Evans, Sahil Singla, Gang Li, Adrian Weller, Junfeng He, Deepak Ramachandran, Krishnamurthy Dj Dvijotham
Abstract: Human feedback plays a critical role in learning and refining reward models for text-to-image generation, but the optimal form the feedback should take for learning an accurate reward function has not been conclusively established. This paper investigates the effectiveness of fine-grained feedback which captures nuanced distinctions in image quality and prompt-alignment, compared to traditional coarse-grained feedback (for example, thumbs up/down or ranking between a set of options). While fine-grained feedback holds promise, particularly for systems catering to diverse societal preferences, we show that demonstrating its superiority to coarse-grained feedback is not automatic. Through experiments on real and synthetic preference data, we surface the complexities of building effective models due to the interplay of model choice, feedback type, and the alignment between human judgment and computational interpretation. We identify key challenges in eliciting and utilizing fine-grained feedback, prompting a reassessment of its assumed benefits and practicality.
Our findings -- e.g., that fine-grained feedback can lead to worse models for a fixed budget, in some settings; however, in controlled settings with known attributes, fine grained rewards can indeed be more helpful -- call for careful consideration of feedback attributes and potentially beckon novel modeling approaches to appropriately unlock the potential value of fine-grained feedback in-the-wild.
Submitted 17 October, 2024; v1 submitted 24 June, 2024; originally announced June 2024.

9. arXiv:2406.11794 [pdf, other]  cs.LG; cs.CL
DataComp-LM: In search of the next generation of training sets for language models
Authors: Jeffrey Li, Alex Fang, Georgios Smyrnis, Maor Ivgi, Matt Jordan, Samir Gadre, Hritik Bansal, Etash Guha, Sedrick Keh, Kushal Arora, Saurabh Garg, Rui Xin, Niklas Muennighoff, Reinhard Heckel, Jean Mercat, Mayee Chen, Suchin Gururangan, Mitchell Wortsman, Alon Albalak, Yonatan Bitton, Marianna Nezhurina, Amro Abbas, Cheng-Yu Hsieh,
href="/search/cs?searchtype=author&amp;query=Ghosh%2C+D">Dhruba Ghosh</a>, <a href="/search/cs?searchtype=author&amp;query=Gardner%2C+J">Josh Gardner</a> , et al. (34 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.11794v3-abstract-short" style="display: inline;"> We introduce DataComp for Language Models (DCLM), a testbed for controlled dataset experiments with the goal of improving language models. As part of DCLM, we provide a standardized corpus of 240T tokens extracted from Common Crawl, effective pretraining recipes based on the OpenLM framework, and a broad suite of 53 downstream evaluations. Participants in the DCLM benchmark can experiment with dat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11794v3-abstract-full').style.display = 'inline'; document.getElementById('2406.11794v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.11794v3-abstract-full" style="display: none;"> We introduce DataComp for Language Models (DCLM), a testbed for controlled dataset experiments with the goal of improving language models. As part of DCLM, we provide a standardized corpus of 240T tokens extracted from Common Crawl, effective pretraining recipes based on the OpenLM framework, and a broad suite of 53 downstream evaluations. Participants in the DCLM benchmark can experiment with data curation strategies such as deduplication, filtering, and data mixing at model scales ranging from 412M to 7B parameters. As a baseline for DCLM, we conduct extensive experiments and find that model-based filtering is key to assembling a high-quality training set. The resulting dataset, DCLM-Baseline enables training a 7B parameter language model from scratch to 64% 5-shot accuracy on MMLU with 2.6T training tokens. Compared to MAP-Neo, the previous state-of-the-art in open-data language models, DCLM-Baseline represents a 6.6 percentage point improvement on MMLU while being trained with 40% less compute. Our baseline model is also comparable to Mistral-7B-v0.3 and Llama 3 8B on MMLU (63% &amp; 66%), and performs similarly on an average of 53 natural language understanding tasks while being trained with 6.6x less compute than Llama 3 8B. Our results highlight the importance of dataset design for training language models and offer a starting point for further research on data curation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.11794v3-abstract-full').style.display = 'none'; document.getElementById('2406.11794v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
Comments: Project page: https://www.datacomp.ai/dclm/

10. arXiv:2406.03520 [pdf, other]  cs.CV; cs.AI; cs.LG
VideoPhy: Evaluating Physical Commonsense for Video Generation
Authors: Hritik Bansal, Zongyu Lin, Tianyi Xie, Zeshun Zong, Michal Yarom, Yonatan Bitton, Chenfanfu Jiang, Yizhou Sun, Kai-Wei Chang, Aditya Grover
Abstract: Recent advances in internet-scale video data pretraining have led to the development of text-to-video generative models that can create high-quality videos across a broad range of visual concepts, synthesize realistic motions and render complex objects. Hence, these generative models have the potential to become general-purpose simulators of the physical world. However, it is unclear how far we are from this goal with the existing text-to-video generative models. To this end, we present VideoPhy, a benchmark designed to assess whether the generated videos follow physical commonsense for real-world activities (e.g. marbles will roll down when placed on a slanted surface).
Specifically, we curate diverse prompts that involve interactions between various material types in the physical world (e.g., solid-solid, solid-fluid, fluid-fluid). We then generate videos conditioned on these captions from diverse state-of-the-art text-to-video generative models, including open models (e.g., CogVideoX) and closed models (e.g., Lumiere, Dream Machine). Our human evaluation reveals that the existing models severely lack the ability to generate videos adhering to the given text prompts, while also lack physical commonsense. Specifically, the best performing model, CogVideoX-5B, generates videos that adhere to the caption and physical laws for 39.6% of the instances. VideoPhy thus highlights that the video generative models are far from accurately simulating the physical world. Finally, we propose an auto-evaluator, VideoCon-Physics, to assess the performance reliably for the newly released models.
Submitted 3 October, 2024; v1 submitted 5 June, 2024; originally announced June 2024.
Comments: 43 pages, 29 figures, 12 tables. Added CogVideo and Dream Machine in v2

11. arXiv:2405.10122 [pdf, other]  cs.CV
Generating Coherent Sequences of Visual Illustrations for Real-World Manual Tasks
Authors: João Bordalo, Vasco Ramos, Rodrigo Valério, Diogo Glória-Silva, Yonatan Bitton, Michal Yarom, Idan Szpektor, Joao Magalhaes
While Large Language Models (LLMs) have become adept at generating coherent textual steps, Large Vision/Language Models (LVLMs) are less capable of generating accompanying image sequences. The most challenging aspect is that each generated im&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10122v1-abstract-full').style.display = 'inline'; document.getElementById('2405.10122v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.10122v1-abstract-full" style="display: none;"> Multistep instructions, such as recipes and how-to guides, greatly benefit from visual aids, such as a series of images that accompany the instruction steps. While Large Language Models (LLMs) have become adept at generating coherent textual steps, Large Vision/Language Models (LVLMs) are less capable of generating accompanying image sequences. The most challenging aspect is that each generated image needs to adhere to the relevant textual step instruction, as well as be visually consistent with earlier images in the sequence. To address this problem, we propose an approach for generating consistent image sequences, which integrates a Latent Diffusion Model (LDM) with an LLM to transform the sequence into a caption to maintain the semantic coherence of the sequence. In addition, to maintain the visual coherence of the image sequence, we introduce a copy mechanism to initialise reverse diffusion processes with a latent vector iteration from a previously generated image from a relevant step. Both strategies will condition the reverse diffusion process on the sequence of instruction steps and tie the contents of the current image to previous instruction steps and corresponding images. Experiments show that the proposed approach is preferred by humans in 46.6% of the cases against 26.6% for the second best method. In addition, automatic metrics showed that the proposed method maintains semantic coherence and visual consistency across steps in both domains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10122v1-abstract-full').style.display = 'none'; document.getElementById('2405.10122v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.04682">arXiv:2405.04682</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.04682">pdf</a>, <a href="https://arxiv.org/format/2405.04682">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> TALC: Time-Aligned Captions for Multi-Scene Text-to-Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bansal%2C+H">Hritik Bansal</a>, <a href="/search/cs?searchtype=author&amp;query=Bitton%2C+Y">Yonatan Bitton</a>, <a href="/search/cs?searchtype=author&amp;query=Yarom%2C+M">Michal Yarom</a>, <a href="/search/cs?searchtype=author&amp;query=Szpektor%2C+I">Idan Szpektor</a>, <a href="/search/cs?searchtype=author&amp;query=Grover%2C+A">Aditya Grover</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+K">Kai-Wei Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.04682v4-abstract-short" style="display: inline;"> Most text-to-video (T2V) generative models often produce single-scene video clips that depict an entity performing a particular action (e.g., &#39;a red panda climbing a tree&#39;). However, it is pertinent to generate multi-scene videos since they are ubiquitous in the real world (e.g., &#39;a red panda climbing a tree&#39; followed by &#39;the red panda sleeps on the top of the tree&#39;). To generate multi-sc&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04682v4-abstract-full').style.display = 'inline'; document.getElementById('2405.04682v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.04682v4-abstract-full" style="display: none;"> Most text-to-video (T2V) generative models often produce single-scene video clips that depict an entity performing a particular action (e.g., &#39;a red panda climbing a tree&#39;). However, it is pertinent to generate multi-scene videos since they are ubiquitous in the real world (e.g., &#39;a red panda climbing a tree&#39; followed by &#39;the red panda sleeps on the top of the tree&#39;). To generate multi-scene videos from the pretrained T2V model, we introduce a simple and effective Time-Aligned Captions (TALC) framework. Specifically, we enhance the text-conditioning mechanism in the T2V architecture to recognize the temporal alignment between the video scenes and scene descriptions. For instance, we condition the visual features of the earlier and later scenes of the generated video with the representations of the first scene description (e.g., &#39;a red panda climbing a tree&#39;) and second scene description (e.g., &#39;the red panda sleeps on the top of the tree&#39;), respectively. As a result, we show that the T2V model can generate multi-scene videos that adhere to the multi-scene text descriptions and be visually consistent (e.g., entity and background).
Further, we finetune the pretrained T2V model with multi-scene video-text data using the TALC framework. We show that the TALC-finetuned model outperforms the baseline by achieving a relative gain of 29% in the overall score, which averages visual consistency and text adherence using human evaluation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.04682v4-abstract-full').style.display = 'none'; document.getElementById('2405.04682v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, 14 figures, 11 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.02793">arXiv:2405.02793</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.02793">pdf</a>, <a href="https://arxiv.org/format/2405.02793">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ImageInWords: Unlocking Hyper-Detailed Image Descriptions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Garg%2C+R">Roopal Garg</a>, <a href="/search/cs?searchtype=author&amp;query=Burns%2C+A">Andrea Burns</a>, <a href="/search/cs?searchtype=author&amp;query=Ayan%2C+B+K">Burcu Karagol Ayan</a>, <a href="/search/cs?searchtype=author&amp;query=Bitton%2C+Y">Yonatan Bitton</a>, <a href="/search/cs?searchtype=author&amp;query=Montgomery%2C+C">Ceslee Montgomery</a>, <a href="/search/cs?searchtype=author&amp;query=Onoe%2C+Y">Yasumasa Onoe</a>, <a href="/search/cs?searchtype=author&amp;query=Bunner%2C+A">Andrew Bunner</a>, <a href="/search/cs?searchtype=author&amp;query=Krishna%2C+R">Ranjay Krishna</a>, <a href="/search/cs?searchtype=author&amp;query=Baldridge%2C+J">Jason Baldridge</a>, <a href="/search/cs?searchtype=author&amp;query=Soricut%2C+R">Radu Soricut</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.02793v2-abstract-short" style="display: inline;"> Despite the longstanding adage &#34;an image is worth a thousand words,&#34; generating accurate hyper-detailed image descriptions remains unsolved. Trained on short web-scraped image text, vision-language models often generate incomplete descriptions with visual inconsistencies. 
We address this via a novel data-centric approach with ImageInWords (IIW), a carefully designed human-in-the-loop framework for&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02793v2-abstract-full').style.display = 'inline'; document.getElementById('2405.02793v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.02793v2-abstract-full" style="display: none;"> Despite the longstanding adage &#34;an image is worth a thousand words,&#34; generating accurate hyper-detailed image descriptions remains unsolved. Trained on short web-scraped image text, vision-language models often generate incomplete descriptions with visual inconsistencies. We address this via a novel data-centric approach with ImageInWords (IIW), a carefully designed human-in-the-loop framework for curating hyper-detailed image descriptions. Human evaluations on IIW data show major gains compared to recent datasets (+66%) and GPT4V (+48%) across comprehensiveness, specificity, hallucinations, and more. We also show that fine-tuning with IIW data improves these metrics by +31% against models trained with prior work, even with only 9k samples. Lastly, we evaluate IIW models with text-to-image generation and vision-language reasoning tasks. Our generated descriptions result in the highest fidelity images, and boost compositional reasoning by up to 6% on ARO, SVO-Probes, and Winoground datasets. We release the IIW Eval benchmark with human judgement labels, object and image-level annotations from our framework, and existing image caption datasets enriched via IIW-model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02793v2-abstract-full').style.display = 'none'; document.getElementById('2405.02793v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Webpage (https://google.github.io/imageinwords), GitHub (https://github.com/google/imageinwords), HuggingFace (https://huggingface.co/datasets/google/imageinwords)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.19753">arXiv:2404.19753</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.19753">pdf</a>, <a href="https://arxiv.org/format/2404.19753">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DOCCI: Descriptions of Connected and Contrasting Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Onoe%2C+Y">Yasumasa Onoe</a>, <a href="/search/cs?searchtype=author&amp;query=Rane%2C+S">Sunayana Rane</a>, <a href="/search/cs?searchtype=author&amp;query=Berger%2C+Z">Zachary Berger</a>, <a href="/search/cs?searchtype=author&amp;query=Bitton%2C+Y">Yonatan Bitton</a>, <a href="/search/cs?searchtype=author&amp;query=Cho%2C+J">Jaemin Cho</a>, <a href="/search/cs?searchtype=author&amp;query=Garg%2C+R">Roopal Garg</a>, <a href="/search/cs?searchtype=author&amp;query=Ku%2C+A">Alexander Ku</a>, <a href="/search/cs?searchtype=author&amp;query=Parekh%2C+Z">Zarana Parekh</a>, <a href="/search/cs?searchtype=author&amp;query=Pont-Tuset%2C+J">Jordi Pont-Tuset</a>, <a href="/search/cs?searchtype=author&amp;query=Tanzer%2C+G">Garrett Tanzer</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Su Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Baldridge%2C+J">Jason Baldridge</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.19753v1-abstract-short" style="display: inline;"> Vision-language datasets are vital for both text-to-image (T2I) and image-to-text (I2T) research. However, current datasets lack descriptions with fine-grained detail that would allow for richer associations to be learned by models. To fill the gap, we introduce Descriptions of Connected and Contrasting Images (DOCCI), a dataset with long, human-annotated English descriptions for 15k images that w&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19753v1-abstract-full').style.display = 'inline'; document.getElementById('2404.19753v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.19753v1-abstract-full" style="display: none;"> Vision-language datasets are vital for both text-to-image (T2I) and image-to-text (I2T) research. However, current datasets lack descriptions with fine-grained detail that would allow for richer associations to be learned by models. 
To fill the gap, we introduce Descriptions of Connected and Contrasting Images (DOCCI), a dataset with long, human-annotated English descriptions for 15k images that were taken, curated and donated by a single researcher intent on capturing key challenges such as spatial relations, counting, text rendering, world knowledge, and more. We instruct human annotators to create comprehensive descriptions for each image; these average 136 words in length and are crafted to clearly distinguish each image from those that are related or similar. Each description is highly compositional and typically encompasses multiple challenges. Through both quantitative and qualitative analyses, we demonstrate that DOCCI serves as an effective training resource for image-to-text generation -- a PaLI 5B model finetuned on DOCCI shows equal or superior results compared to highly-performant larger models like LLaVA-1.5 7B and InstructBLIP 7B. Furthermore, we show that DOCCI is a useful testbed for text-to-image generation, highlighting the limitations of current text-to-image models in capturing long descriptions and fine details. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19753v1-abstract-full').style.display = 'none'; document.getElementById('2404.19753v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.01139">arXiv:2403.01139</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.01139">pdf</a>, <a href="https://arxiv.org/format/2403.01139">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ParallelPARC: A Scalable Pipeline for Generating Natural-Language Analogies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sultan%2C+O">Oren Sultan</a>, <a href="/search/cs?searchtype=author&amp;query=Bitton%2C+Y">Yonatan Bitton</a>, <a href="/search/cs?searchtype=author&amp;query=Yosef%2C+R">Ron Yosef</a>, <a href="/search/cs?searchtype=author&amp;query=Shahaf%2C+D">Dafna Shahaf</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.01139v4-abstract-short" style="display: inline;"> Analogy-making is central to human cognition, allowing us to adapt to novel situations -- an ability that current AI systems still lack. Most analogy datasets today focus on simple analogies (e.g., word analogies); datasets including complex types of analogies are typically manually curated and very small. We believe that this holds back progress in computational analogy. 
In this work, we design a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.01139v4-abstract-full').style.display = 'inline'; document.getElementById('2403.01139v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.01139v4-abstract-full" style="display: none;"> Analogy-making is central to human cognition, allowing us to adapt to novel situations -- an ability that current AI systems still lack. Most analogy datasets today focus on simple analogies (e.g., word analogies); datasets including complex types of analogies are typically manually curated and very small. We believe that this holds back progress in computational analogy. In this work, we design a data generation pipeline, ParallelPARC (Parallel Paragraph Creator) leveraging state-of-the-art Large Language Models (LLMs) to create complex, paragraph-based analogies, as well as distractors, both simple and challenging. We demonstrate our pipeline and create ProPara-Logy, a dataset of analogies between scientific processes. We publish a gold-set, validated by humans, and a silver-set, generated automatically. We test LLMs&#39; and humans&#39; analogy recognition in binary and multiple-choice settings, and found that humans outperform the best models (~13% gap) after a light supervision. We demonstrate that our silver-set is useful for training models. Lastly, we show challenging distractors confuse LLMs, but not humans. We hope our pipeline will encourage research in this emerging field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.01139v4-abstract-full').style.display = 'none'; document.getElementById('2403.01139v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NAACL 2024 (Main Conference)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.00559">arXiv:2402.00559</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.00559">pdf</a>, <a href="https://arxiv.org/format/2402.00559">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Chain-of-Thought Is as Strong as Its Weakest Link: A Benchmark for Verifiers of Reasoning Chains </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jacovi%2C+A">Alon Jacovi</a>, <a href="/search/cs?searchtype=author&amp;query=Bitton%2C+Y">Yonatan Bitton</a>, <a href="/search/cs?searchtype=author&amp;query=Bohnet%2C+B">Bernd Bohnet</a>, <a href="/search/cs?searchtype=author&amp;query=Herzig%2C+J">Jonathan Herzig</a>, <a href="/search/cs?searchtype=author&amp;query=Honovich%2C+O">Or Honovich</a>, <a href="/search/cs?searchtype=author&amp;query=Tseng%2C+M">Michael Tseng</a>, <a href="/search/cs?searchtype=author&amp;query=Collins%2C+M">Michael Collins</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Geva%2C+M">Mor Geva</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.00559v4-abstract-short" style="display: inline;"> Prompting language models to provide step-by-step answers (e.g., &#34;Chain-of-Thought&#34;) is the prominent approach for complex reasoning tasks, where more accurate reasoning chains typically improve downstream task performance. Recent literature discusses automatic methods to verify reasoning to evaluate and improve their correctness. However, no fine-grained step-level datasets are available to enabl&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.00559v4-abstract-full').style.display = 'inline'; document.getElementById('2402.00559v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.00559v4-abstract-full" style="display: none;"> Prompting language models to provide step-by-step answers (e.g., &#34;Chain-of-Thought&#34;) is the prominent approach for complex reasoning tasks, where more accurate reasoning chains typically improve downstream task performance. Recent literature discusses automatic methods to verify reasoning to evaluate and improve their correctness. However, no fine-grained step-level datasets are available to enable thorough evaluation of such verification methods, hindering progress in this direction. We introduce REVEAL: Reasoning Verification Evaluation, a dataset to benchmark automatic verifiers of complex Chain-of-Thought reasoning in open-domain question-answering settings. REVEAL includes comprehensive labels for the relevance, attribution to evidence passages, and logical correctness of each reasoning step in a language model&#39;s answer, across a variety of datasets and state-of-the-art language models. 
Evaluation on REVEAL shows that verifiers struggle at verifying reasoning chains - in particular, verifying logical correctness and detecting contradictions. Available at https://reveal-dataset.github.io/ . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.00559v4-abstract-full').style.display = 'none'; document.getElementById('2402.00559v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ACL 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.03766">arXiv:2312.03766</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2312.03766">pdf</a>, <a href="https://arxiv.org/format/2312.03766">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Mismatch Quest: Visual and Textual Feedback for Image-Text Misalignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gordon%2C+B">Brian Gordon</a>, <a href="/search/cs?searchtype=author&amp;query=Bitton%2C+Y">Yonatan Bitton</a>, <a href="/search/cs?searchtype=author&amp;query=Shafir%2C+Y">Yonatan Shafir</a>, <a href="/search/cs?searchtype=author&amp;query=Garg%2C+R">Roopal Garg</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Lischinski%2C+D">Dani Lischinski</a>, <a href="/search/cs?searchtype=author&amp;query=Cohen-Or%2C+D">Daniel Cohen-Or</a>, <a href="/search/cs?searchtype=author&amp;query=Szpektor%2C+I">Idan Szpektor</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.03766v2-abstract-short" style="display: inline;"> While existing image-text alignment models reach high quality binary assessments, they fall short of pinpointing the exact source of misalignment. In this paper, we present a method to provide detailed textual and visual explanation of detected misalignments between text-image pairs. We leverage large language models and visual grounding models to automatically construct a training set that holds&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.03766v2-abstract-full').style.display = 'inline'; document.getElementById('2312.03766v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.03766v2-abstract-full" style="display: none;"> While existing image-text alignment models reach high quality binary assessments, they fall short of pinpointing the exact source of misalignment. 
In this paper, we present a method to provide detailed textual and visual explanation of detected misalignments between text-image pairs. We leverage large language models and visual grounding models to automatically construct a training set that holds plausible misaligned captions for a given image and corresponding textual explanations and visual indicators. We also publish a new human curated test set comprising ground-truth textual and visual misalignment annotations. Empirical results show that fine-tuning vision language models on our training set enables them to articulate misalignments and visually indicate them within images, outperforming strong baselines both on the binary alignment classification and the explanation generation tasks. Our method code and human curated test set are available at: https://mismatch-quest.github.io/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.03766v2-abstract-full').style.display = 'none'; document.getElementById('2312.03766v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ECCV 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.10111">arXiv:2311.10111</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.10111">pdf</a>, <a href="https://arxiv.org/format/2311.10111">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> VideoCon: Robust Video-Language Alignment via Contrast Captions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bansal%2C+H">Hritik Bansal</a>, <a href="/search/cs?searchtype=author&amp;query=Bitton%2C+Y">Yonatan Bitton</a>, <a href="/search/cs?searchtype=author&amp;query=Szpektor%2C+I">Idan Szpektor</a>, <a href="/search/cs?searchtype=author&amp;query=Chang%2C+K">Kai-Wei Chang</a>, <a href="/search/cs?searchtype=author&amp;query=Grover%2C+A">Aditya Grover</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.10111v1-abstract-short" style="display: inline;"> Despite being (pre)trained on a massive amount of data, state-of-the-art video-language alignment models are not robust to semantically-plausible contrastive changes in the video captions. Our work addresses this by identifying a broad spectrum of contrast misalignments, such as replacing entities, actions, and flipping event order, which alignment models should be robust against. 
To this end, we&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.10111v1-abstract-full').style.display = 'inline'; document.getElementById('2311.10111v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.10111v1-abstract-full" style="display: none;"> Despite being (pre)trained on a massive amount of data, state-of-the-art video-language alignment models are not robust to semantically-plausible contrastive changes in the video captions. Our work addresses this by identifying a broad spectrum of contrast misalignments, such as replacing entities, actions, and flipping event order, which alignment models should be robust against. To this end, we introduce the VideoCon, a video-language alignment dataset constructed by a large language model that generates plausible contrast video captions and explanations for differences between original and contrast video captions. Then, a generative video-language model is finetuned with VideoCon to assess video-language entailment and generate explanations. Our VideoCon-based alignment model significantly outperforms current models. It exhibits a 12-point increase in AUC for the video-language alignment task on human-generated contrast captions. Finally, our model sets new state of the art zero-shot performance in temporally-extensive video-language tasks such as text-to-video retrieval (SSv2-Temporal) and video question answering (ATP-Hard). Moreover, our model shows superior performance on novel videos and human-crafted captions and explanations. Our code and data are available at https://github.com/Hritikbansal/videocon. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.10111v1-abstract-full').style.display = 'none'; document.getElementById('2311.10111v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22 pages, 19 Figures, 7 Tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.06595">arXiv:2308.06595</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.06595">pdf</a>, <a href="https://arxiv.org/format/2308.06595">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> VisIT-Bench: A Benchmark for Vision-Language Instruction Following Inspired by Real-World Use </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bitton%2C+Y">Yonatan Bitton</a>, <a href="/search/cs?searchtype=author&amp;query=Bansal%2C+H">Hritik Bansal</a>, <a href="/search/cs?searchtype=author&amp;query=Hessel%2C+J">Jack Hessel</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+R">Rulin Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+W">Wanrong Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Awadalla%2C+A">Anas Awadalla</a>, <a href="/search/cs?searchtype=author&amp;query=Gardner%2C+J">Josh Gardner</a>, <a href="/search/cs?searchtype=author&amp;query=Taori%2C+R">Rohan Taori</a>, <a href="/search/cs?searchtype=author&amp;query=Schmidt%2C+L">Ludwig Schmidt</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.06595v4-abstract-short" style="display: inline;"> We introduce VisIT-Bench (Visual InsTruction Benchmark), a benchmark for evaluation of instruction-following vision-language models for real-world use. Our starting point is curating 70 &#39;instruction families&#39; that we envision instruction tuned vision-language models should be able to address. Extending beyond evaluations like VQAv2 and COCO, tasks range from basic recognition to game playing and c&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.06595v4-abstract-full').style.display = 'inline'; document.getElementById('2308.06595v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.06595v4-abstract-full" style="display: none;"> We introduce VisIT-Bench (Visual InsTruction Benchmark), a benchmark for evaluation of instruction-following vision-language models for real-world use. Our starting point is curating 70 &#39;instruction families&#39; that we envision instruction tuned vision-language models should be able to address. Extending beyond evaluations like VQAv2 and COCO, tasks range from basic recognition to game playing and creative generation. Following curation, our dataset comprises 592 test queries, each with a human-authored instruction-conditioned caption. 
These descriptions surface instruction-specific factors, e.g., for an instruction asking about the accessibility of a storefront for wheelchair users, the instruction-conditioned caption describes ramps/potential obstacles. These descriptions enable 1) collecting human-verified reference outputs for each instance; and 2) automatic evaluation of candidate multimodal generations using a text-only LLM, aligning with human judgment. We quantify quality gaps between models and references using both human and automatic evaluations; e.g., the top-performing instruction-following model wins against the GPT-4 reference in just 27% of the comparisons. VisIT-Bench is dynamic: to participate, practitioners simply submit their model&#39;s response on the project website. Data, code, and the leaderboard are available at visit-bench.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.06595v4-abstract-full').style.display = 'none'; document.getElementById('2308.06595v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2023, Datasets and Benchmarks. Website: https://visit-bench.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.01390">arXiv:2308.01390</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2308.01390">pdf</a>, <a href="https://arxiv.org/format/2308.01390">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> OpenFlamingo: An Open-Source Framework for Training Large Autoregressive Vision-Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Awadalla%2C+A">Anas Awadalla</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+I">Irena Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Gardner%2C+J">Josh Gardner</a>, <a href="/search/cs?searchtype=author&amp;query=Hessel%2C+J">Jack Hessel</a>, <a href="/search/cs?searchtype=author&amp;query=Hanafy%2C+Y">Yusuf Hanafy</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+W">Wanrong Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Marathe%2C+K">Kalyani Marathe</a>, <a href="/search/cs?searchtype=author&amp;query=Bitton%2C+Y">Yonatan Bitton</a>, <a href="/search/cs?searchtype=author&amp;query=Gadre%2C+S">Samir Gadre</a>, <a href="/search/cs?searchtype=author&amp;query=Sagawa%2C+S">Shiori Sagawa</a>, <a href="/search/cs?searchtype=author&amp;query=Jitsev%2C+J">Jenia Jitsev</a>, <a href="/search/cs?searchtype=author&amp;query=Kornblith%2C+S">Simon Kornblith</a>, <a href="/search/cs?searchtype=author&amp;query=Koh%2C+P+W">Pang Wei
Koh</a>, <a href="/search/cs?searchtype=author&amp;query=Ilharco%2C+G">Gabriel Ilharco</a>, <a href="/search/cs?searchtype=author&amp;query=Wortsman%2C+M">Mitchell Wortsman</a>, <a href="/search/cs?searchtype=author&amp;query=Schmidt%2C+L">Ludwig Schmidt</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.01390v2-abstract-short" style="display: inline;"> We introduce OpenFlamingo, a family of autoregressive vision-language models ranging from 3B to 9B parameters. OpenFlamingo is an ongoing effort to produce an open-source replication of DeepMind&#39;s Flamingo models. On seven vision-language datasets, OpenFlamingo models average between 80 - 89% of corresponding Flamingo performance. This technical report describes our models, training data, hyperpar&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.01390v2-abstract-full').style.display = 'inline'; document.getElementById('2308.01390v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.01390v2-abstract-full" style="display: none;"> We introduce OpenFlamingo, a family of autoregressive vision-language models ranging from 3B to 9B parameters. OpenFlamingo is an ongoing effort to produce an open-source replication of DeepMind&#39;s Flamingo models. On seven vision-language datasets, OpenFlamingo models average between 80 - 89% of corresponding Flamingo performance. This technical report describes our models, training data, hyperparameters, and evaluation suite. We share our models and code at https://github.com/mlfoundations/open_flamingo. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.01390v2-abstract-full').style.display = 'none'; document.getElementById('2308.01390v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.04532">arXiv:2307.04532</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2307.04532">pdf</a>, <a href="https://arxiv.org/format/2307.04532">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Read, Look or Listen? 
What&#39;s Needed for Solving a Multimodal Dataset </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Madvil%2C+N">Netta Madvil</a>, <a href="/search/cs?searchtype=author&amp;query=Bitton%2C+Y">Yonatan Bitton</a>, <a href="/search/cs?searchtype=author&amp;query=Schwartz%2C+R">Roy Schwartz</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.04532v1-abstract-short" style="display: inline;"> The prevalence of large-scale multimodal datasets presents unique challenges in assessing dataset quality. We propose a two-step method to analyze multimodal datasets, which leverages a small seed of human annotation to map each multimodal instance to the modalities required to process it. Our method sheds light on the importance of different modalities in datasets, as well as the relationship bet&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.04532v1-abstract-full').style.display = 'inline'; document.getElementById('2307.04532v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.04532v1-abstract-full" style="display: none;"> The prevalence of large-scale multimodal datasets presents unique challenges in assessing dataset quality. We propose a two-step method to analyze multimodal datasets, which leverages a small seed of human annotation to map each multimodal instance to the modalities required to process it. Our method sheds light on the importance of different modalities in datasets, as well as the relationship between them. We apply our approach to TVQA, a video question-answering dataset, and discover that most questions can be answered using a single modality, without a substantial bias towards any specific modality. Moreover, we find that more than 70% of the questions are solvable using several different single-modality strategies, e.g., by either looking at the video or listening to the audio, highlighting the limited integration of multiple modalities in TVQA. We leverage our annotation and analyze the MERLOT Reserve, finding that it struggles with image-based questions compared to text and audio, but also with auditory speaker identification. Based on our observations, we introduce a new test set that necessitates multiple modalities, observing a dramatic drop in model performance. Our methodology provides valuable insights into multimodal datasets and highlights the need for the development of more robust models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.04532v1-abstract-full').style.display = 'none'; document.getElementById('2307.04532v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.15026">arXiv:2305.15026</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.15026">pdf</a>, <a href="https://arxiv.org/format/2305.15026">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Transferring Visual Attributes from Natural Language to Verified Image Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Valerio%2C+R">Rodrigo Valerio</a>, <a href="/search/cs?searchtype=author&amp;query=Bordalo%2C+J">Joao Bordalo</a>, <a href="/search/cs?searchtype=author&amp;query=Yarom%2C+M">Michal Yarom</a>, <a href="/search/cs?searchtype=author&amp;query=Bitton%2C+Y">Yonatan Bitton</a>, <a href="/search/cs?searchtype=author&amp;query=Szpektor%2C+I">Idan Szpektor</a>, <a href="/search/cs?searchtype=author&amp;query=Magalhaes%2C+J">Joao Magalhaes</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.15026v2-abstract-short" style="display: inline;"> Text to image generation methods (T2I) are widely popular in generating art and other creative artifacts. While visual hallucinations can be a positive factor in scenarios where creativity is appreciated, such artifacts are poorly suited for cases where the generated image needs to be grounded in complex natural language without explicit visual elements. In this paper, we propose to strengthen the&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.15026v2-abstract-full').style.display = 'inline'; document.getElementById('2305.15026v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.15026v2-abstract-full" style="display: none;"> Text to image generation methods (T2I) are widely popular in generating art and other creative artifacts. While visual hallucinations can be a positive factor in scenarios where creativity is appreciated, such artifacts are poorly suited for cases where the generated image needs to be grounded in complex natural language without explicit visual elements. In this paper, we propose to strengthen the consistency property of T2I methods in the presence of natural complex language, which often breaks the limits of T2I methods by including non-visual information, and textual elements that require knowledge for accurate generation. To address these phenomena, we propose a Natural Language to Verified Image generation approach (NL2VI) that converts a natural prompt into a visual prompt, which is more suitable for image generation. A T2I model then generates an image for the visual prompt, which is then verified with VQA algorithms. Experimentally, aligning natural prompts with image generation can improve the consistency of the generated images by up to 11% over the state of the art. Moreover, improvements can generalize to challenging domains like cooking and DIY tasks, where the correctness of the generated image is crucial to illustrate actions. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.15026v2-abstract-full').style.display = 'none'; document.getElementById('2305.15026v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.10400">arXiv:2305.10400</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2305.10400">pdf</a>, <a href="https://arxiv.org/format/2305.10400">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> What You See is What You Read? Improving Text-Image Alignment Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yarom%2C+M">Michal Yarom</a>, <a href="/search/cs?searchtype=author&amp;query=Bitton%2C+Y">Yonatan Bitton</a>, <a href="/search/cs?searchtype=author&amp;query=Changpinyo%2C+S">Soravit Changpinyo</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Herzig%2C+J">Jonathan Herzig</a>, <a href="/search/cs?searchtype=author&amp;query=Lang%2C+O">Oran Lang</a>, <a href="/search/cs?searchtype=author&amp;query=Ofek%2C+E">Eran Ofek</a>, <a href="/search/cs?searchtype=author&amp;query=Szpektor%2C+I">Idan Szpektor</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.10400v4-abstract-short" style="display: inline;"> Automatically determining whether a text and a corresponding image are semantically aligned is a significant challenge for vision-language models, with applications in generative text-to-image and image-to-text tasks. In this work, we study methods for automatic text-image alignment evaluation. We first introduce SeeTRUE: a comprehensive evaluation set, spanning multiple datasets from both text-to&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.10400v4-abstract-full').style.display = 'inline'; document.getElementById('2305.10400v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.10400v4-abstract-full" style="display: none;"> Automatically determining whether a text and a corresponding image are semantically aligned is a significant challenge for vision-language models, with applications in generative text-to-image and image-to-text tasks. In this work, we study methods for automatic text-image alignment evaluation. We first introduce SeeTRUE: a comprehensive evaluation set, spanning multiple datasets from both text-to-image and image-to-text generation tasks, with human judgements for whether a given text-image pair is semantically aligned. 
We then describe two automatic methods to determine alignment: the first involving a pipeline based on question generation and visual question answering models, and the second employing an end-to-end classification approach by finetuning multimodal pretrained models. Both methods surpass prior approaches in various text-image alignment tasks, with significant improvements in challenging cases that involve complex composition or unnatural images. Finally, we demonstrate how our approaches can localize specific misalignments between an image and a given text, and how they can be used to automatically re-rank candidates in text-to-image generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.10400v4-abstract-full').style.display = 'none'; document.getElementById('2305.10400v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 May, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2023. Website: https://wysiwyr-itm.github.io/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.14318">arXiv:2304.14318</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2304.14318">pdf</a>, <a href="https://arxiv.org/format/2304.14318">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> q2d: Turning Questions into Dialogs to Teach Models How to Search </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bitton%2C+Y">Yonatan Bitton</a>, <a href="/search/cs?searchtype=author&amp;query=Cohen-Ganor%2C+S">Shlomi Cohen-Ganor</a>, <a href="/search/cs?searchtype=author&amp;query=Hakimi%2C+I">Ido Hakimi</a>, <a href="/search/cs?searchtype=author&amp;query=Lewenberg%2C+Y">Yoad Lewenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Aharoni%2C+R">Roee Aharoni</a>, <a href="/search/cs?searchtype=author&amp;query=Weinreb%2C+E">Enav Weinreb</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.14318v2-abstract-short" style="display: inline;"> One of the exciting capabilities of recent language models for dialog is their ability to independently search for relevant information to ground a given dialog response. However, obtaining training data to teach models how to issue search queries is time and resource consuming. 
In this work, we propose q2d: an automatic data generation pipeline that generates information-seeking dialogs from ques&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.14318v2-abstract-full').style.display = 'inline'; document.getElementById('2304.14318v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.14318v2-abstract-full" style="display: none;"> One of the exciting capabilities of recent language models for dialog is their ability to independently search for relevant information to ground a given dialog response. However, obtaining training data to teach models how to issue search queries is time and resource consuming. In this work, we propose q2d: an automatic data generation pipeline that generates information-seeking dialogs from questions. We prompt a large language model (PaLM) to create conversational versions of question answering datasets, and use it to improve query generation models that communicate with external search APIs to ground dialog responses. Unlike previous approaches, which relied on human-written dialogs with search queries, our method allows us to automatically generate query-based grounded dialogs with better control and scale. Our experiments demonstrate that: (1) For query generation on the QReCC dataset, models trained on our synthetically-generated data achieve 90%--97% of the performance of models trained on the human-generated data; (2) We can successfully generate data for training dialog models in new domains without any existing dialog data as demonstrated on the multi-hop MuSiQue and Bamboogle QA datasets. (3) We perform a thorough analysis of the generated dialogs showing that humans find them of high quality and struggle to distinguish them from human-written dialogs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.14318v2-abstract-full').style.display = 'none'; document.getElementById('2304.14318v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to EMNLP 2023.

arXiv:2304.14108 (https://arxiv.org/abs/2304.14108) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.CL (Computation and Language); cs.LG (Machine Learning)
Title: DataComp: In search of the next generation of multimodal datasets
Authors: Samir Yitzhak Gadre, Gabriel Ilharco, Alex Fang, Jonathan Hayase, Georgios Smyrnis, Thao Nguyen, Ryan Marten, Mitchell Wortsman, Dhruba Ghosh, Jieyu Zhang, Eyal Orgad, Rahim Entezari, Giannis Daras, Sarah Pratt, Vivek Ramanujan, Yonatan Bitton, Kalyani Marathe, Stephen Mussmann, Richard Vencu, Mehdi Cherti, Ranjay Krishna, Pang Wei Koh, Olga Saukh, Alexander Ratner, Shuran Song, et al. (9 additional authors not shown)
Abstract: Multimodal datasets are a critical component in recent breakthroughs such as Stable Diffusion and GPT-4, yet their design does not receive the same research attention as model architectures or training algorithms. To address this shortcoming in the ML ecosystem, we introduce DataComp, a testbed for dataset experiments centered around a new candidate pool of 12.8 billion image-text pairs from Common Crawl. Participants in our benchmark design new filtering techniques or curate new data sources and then evaluate their new dataset by running our standardized CLIP training code and testing the resulting model on 38 downstream test sets. Our benchmark consists of multiple compute scales spanning four orders of magnitude, which enables the study of scaling trends and makes the benchmark accessible to researchers with varying resources. Our baseline experiments show that the DataComp workflow leads to better training sets. In particular, our best baseline, DataComp-1B, enables training a CLIP ViT-L/14 from scratch to 79.2% zero-shot accuracy on ImageNet, outperforming OpenAI's CLIP ViT-L/14 by 3.7 percentage points while using the same training procedure and compute. We release DataComp and all accompanying code at www.datacomp.ai.
Submitted 20 October, 2023; v1 submitted 27 April, 2023; originally announced April 2023.
Comments: NeurIPS 2023 Datasets and Benchmarks Track
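
As a concrete illustration of the kind of filtering technique the benchmark evaluates, the sketch below keeps image-text pairs whose (precomputed) CLIP image and caption embeddings are sufficiently similar. The threshold, array shapes, and function name are assumptions for illustration, not the released DataComp code.

import numpy as np

def clip_score_filter(img_emb: np.ndarray, txt_emb: np.ndarray,
                      threshold: float = 0.28) -> np.ndarray:
    """Return a boolean mask selecting pairs with cosine similarity >= threshold.

    img_emb, txt_emb: (N, D) arrays of image and caption embeddings."""
    img = img_emb / np.linalg.norm(img_emb, axis=1, keepdims=True)
    txt = txt_emb / np.linalg.norm(txt_emb, axis=1, keepdims=True)
    cos = np.sum(img * txt, axis=1)
    return cos >= threshold

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    img, txt = rng.normal(size=(5, 512)), rng.normal(size=(5, 512))
    print(clip_score_filter(img, txt))  # random vectors rarely pass the threshold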

arXiv:2303.15445 (https://arxiv.org/abs/2303.15445) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition)
Title: IRFL: Image Recognition of Figurative Language
Authors: Ron Yosef, Yonatan Bitton, Dafna Shahaf
Abstract: Figures of speech such as metaphors, similes, and idioms are integral parts of human communication. They are ubiquitous in many forms of discourse, allowing people to convey complex, abstract ideas and evoke emotion. As figurative forms are often conveyed through multiple modalities (e.g., both text and images), understanding multimodal figurative language is an important AI challenge, weaving together vision, language, commonsense, and cultural knowledge. In this work, we develop the Image Recognition of Figurative Language (IRFL) dataset. We leverage human annotation and an automatic pipeline we created to generate a multimodal dataset, and introduce two novel tasks as a benchmark for multimodal figurative language understanding. We experimented with state-of-the-art vision and language models and found that the best (22%) performed substantially worse than humans (97%). We release our dataset, benchmark, and code in the hope of driving the development of models that can better understand figurative language.
Submitted 25 November, 2023; v1 submitted 27 March, 2023; originally announced March 2023.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.15445v3-abstract-full').style.display = 'none'; document.getElementById('2303.15445v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.07274">arXiv:2303.07274</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.07274">pdf</a>, <a href="https://arxiv.org/format/2303.07274">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Breaking Common Sense: WHOOPS! A Vision-and-Language Benchmark of Synthetic and Compositional Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bitton-Guetta%2C+N">Nitzan Bitton-Guetta</a>, <a href="/search/cs?searchtype=author&amp;query=Bitton%2C+Y">Yonatan Bitton</a>, <a href="/search/cs?searchtype=author&amp;query=Hessel%2C+J">Jack Hessel</a>, <a href="/search/cs?searchtype=author&amp;query=Schmidt%2C+L">Ludwig Schmidt</a>, <a href="/search/cs?searchtype=author&amp;query=Elovici%2C+Y">Yuval Elovici</a>, <a href="/search/cs?searchtype=author&amp;query=Stanovsky%2C+G">Gabriel Stanovsky</a>, <a href="/search/cs?searchtype=author&amp;query=Schwartz%2C+R">Roy Schwartz</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.07274v4-abstract-short" style="display: inline;"> Weird, unusual, and uncanny images pique the curiosity of observers because they challenge commonsense. For example, an image released during the 2022 world cup depicts the famous soccer stars Lionel Messi and Cristiano Ronaldo playing chess, which playfully violates our expectation that their competition should occur on the football field. Humans can easily recognize and interpret these unconvent&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.07274v4-abstract-full').style.display = 'inline'; document.getElementById('2303.07274v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.07274v4-abstract-full" style="display: none;"> Weird, unusual, and uncanny images pique the curiosity of observers because they challenge commonsense. For example, an image released during the 2022 world cup depicts the famous soccer stars Lionel Messi and Cristiano Ronaldo playing chess, which playfully violates our expectation that their competition should occur on the football field. Humans can easily recognize and interpret these unconventional images, but can AI models do the same? We introduce WHOOPS!, a new dataset and benchmark for visual commonsense. 
The dataset is comprised of purposefully commonsense-defying images created by designers using publicly-available image generation tools like Midjourney. We consider several tasks posed over the dataset. In addition to image captioning, cross-modal matching, and visual question answering, we introduce a difficult explanation generation task, where models must identify and explain why a given image is unusual. Our results show that state-of-the-art models such as GPT3 and BLIP2 still lag behind human performance on WHOOPS!. We hope our dataset will inspire the development of AI models with stronger visual commonsense reasoning abilities. Data, models and code are available at the project website: whoops-benchmark.github.io <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.07274v4-abstract-full').style.display = 'none'; document.getElementById('2303.07274v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICCV 2023. Website: whoops-benchmark.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2212.04542">arXiv:2212.04542</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2212.04542">pdf</a>, <a href="https://arxiv.org/format/2212.04542">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> VASR: Visual Analogies of Situation Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bitton%2C+Y">Yonatan Bitton</a>, <a href="/search/cs?searchtype=author&amp;query=Yosef%2C+R">Ron Yosef</a>, <a href="/search/cs?searchtype=author&amp;query=Strugo%2C+E">Eli Strugo</a>, <a href="/search/cs?searchtype=author&amp;query=Shahaf%2C+D">Dafna Shahaf</a>, <a href="/search/cs?searchtype=author&amp;query=Schwartz%2C+R">Roy Schwartz</a>, <a href="/search/cs?searchtype=author&amp;query=Stanovsky%2C+G">Gabriel Stanovsky</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2212.04542v1-abstract-short" style="display: inline;"> A core process in human cognition is analogical mapping: the ability to identify a similar relational structure between different situations. We introduce a novel task, Visual Analogies of Situation Recognition, adapting the classical word-analogy task into the visual domain. 

arXiv:2212.04542 (https://arxiv.org/abs/2212.04542) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
Title: VASR: Visual Analogies of Situation Recognition
Authors: Yonatan Bitton, Ron Yosef, Eli Strugo, Dafna Shahaf, Roy Schwartz, Gabriel Stanovsky
Abstract: A core process in human cognition is analogical mapping: the ability to identify a similar relational structure between different situations. We introduce a novel task, Visual Analogies of Situation Recognition, adapting the classical word-analogy task into the visual domain. Given a triplet of images, the task is to select an image candidate B' that completes the analogy (A to A' is like B to what?). Unlike previous work on visual analogy that focused on simple image transformations, we tackle complex analogies requiring understanding of scenes. We leverage situation recognition annotations and the CLIP model to generate a large set of 500k candidate analogies. Crowdsourced annotations for a sample of the data indicate that humans agree with the dataset label ~80% of the time (chance level 25%). Furthermore, we use human annotations to create a gold-standard dataset of 3,820 validated analogies. Our experiments demonstrate that state-of-the-art models do well when distractors are chosen randomly (~86%), but struggle with carefully chosen distractors (~53%, compared to 90% human accuracy). We hope our dataset will encourage the development of new analogy-making models. Website: https://vasr-dataset.github.io/
Submitted 8 December, 2022; originally announced December 2022.
Comments: Accepted to AAAI 2023. Website: https://vasr-dataset.github.io/
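
One simple zero-shot baseline for an A : A' :: B : ? task of this form is embedding arithmetic over image features: pick the candidate closest to emb(B) + (emb(A') - emb(A)). The sketch below operates on precomputed embeddings and only illustrates the task setup; it is not necessarily how the models evaluated in the paper work.

import numpy as np

def solve_analogy(a: np.ndarray, a_prime: np.ndarray, b: np.ndarray,
                  candidates: np.ndarray) -> int:
    """Return the index of the candidate (row of `candidates`) most similar
    (by cosine) to the analogy target vector b + (a_prime - a)."""
    target = b + (a_prime - a)
    target = target / np.linalg.norm(target)
    cands = candidates / np.linalg.norm(candidates, axis=1, keepdims=True)
    return int(np.argmax(cands @ target))

if __name__ == "__main__":
    rng = np.random.default_rng(1)
    a, a_p, b = rng.normal(size=(3, 512))
    gold = b + (a_p - a) + 0.01 * rng.normal(size=512)   # near-correct candidate
    distractors = rng.normal(size=(3, 512))
    cands = np.vstack([distractors[:2], gold, distractors[2:]])
    print(solve_analogy(a, a_p, b, cands))  # expected: 2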

arXiv:2207.12576 (https://arxiv.org/abs/2207.12576) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition); cs.HC (Human-Computer Interaction)
Title: WinoGAViL: Gamified Association Benchmark to Challenge Vision-and-Language Models
Authors: Yonatan Bitton, Nitzan Bitton Guetta, Ron Yosef, Yuval Elovici, Mohit Bansal, Gabriel Stanovsky, Roy Schwartz
Abstract: While vision-and-language models perform well on tasks such as visual question answering, they struggle when it comes to basic human commonsense reasoning skills. In this work, we introduce WinoGAViL: an online game of vision-and-language associations (e.g., between werewolves and a full moon), used as a dynamic evaluation benchmark. Inspired by the popular card game Codenames, a spymaster gives a textual cue related to several visual candidates, and another player tries to identify them. Human players are rewarded for creating associations that are challenging for a rival AI model but still solvable by other human players. We use the game to collect 3.5K instances, finding that they are intuitive for humans (>90% Jaccard index) but challenging for state-of-the-art AI models, where the best model (ViLT) achieves a score of 52%, succeeding mostly where the cue is visually salient. Our analysis, as well as the feedback we collected from players, indicates that the collected associations require diverse reasoning skills, including general knowledge, common sense, abstraction, and more. We release the dataset, the code, and the interactive game, allowing future data collection that can be used to develop models with better association abilities.
Submitted 11 October, 2022; v1 submitted 25 July, 2022; originally announced July 2022.
Comments: Accepted to NeurIPS 2022, Datasets and Benchmarks. Website: https://winogavil.github.io/
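
For reference, the Jaccard index quoted above compares the set of images a player or model selects for a cue with the annotated association set; a minimal scorer of this kind is sketched below with made-up file names.

def jaccard(predicted: set, gold: set) -> float:
    """|intersection| / |union|; 1.0 for a perfect match, 0.0 for disjoint sets."""
    if not predicted and not gold:
        return 1.0
    return len(predicted & gold) / len(predicted | gold)

if __name__ == "__main__":
    gold = {"werewolf.jpg", "full_moon.jpg"}
    predicted = {"full_moon.jpg", "forest.jpg"}
    print(round(jaccard(predicted, gold), 2))  # 0.33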

arXiv:2109.02040 (https://arxiv.org/abs/2109.02040) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Title: Data Efficient Masked Language Modeling for Vision and Language
Authors: Yonatan Bitton, Gabriel Stanovsky, Michael Elhadad, Roy Schwartz
Abstract: Masked language modeling (MLM) is one of the key sub-tasks in vision-language pretraining. In the cross-modal setting, tokens in the sentence are masked at random, and the model predicts the masked tokens given the image and the text. In this paper, we observe several key disadvantages of MLM in this setting. First, as captions tend to be short, in a third of the sentences no token is sampled. Second, the majority of masked tokens are stop-words and punctuation, leading to under-utilization of the image. We investigate a range of alternative masking strategies specific to the cross-modal setting that address these shortcomings, aiming for better fusion of text and image in the learned representation. When pre-training the LXMERT model, our alternative masking strategies consistently improve over the original masking strategy on three downstream tasks, especially in low resource settings. Further, our pre-training approach substantially outperforms the baseline model on a prompt-based probing task designed to elicit image objects. These results and our analysis indicate that our method allows for better utilization of the training data.
Submitted 5 September, 2021; originally announced September 2021.
Comments: Accepted to Findings of EMNLP 2021
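
The sketch below shows one content-word-biased masking strategy in the spirit of the alternatives discussed above: always mask at least one token and prefer content words over stop-words and punctuation. The stop-word list, mask rate, and sampling rule are illustrative assumptions, not the exact strategies evaluated with LXMERT.

import random
import string

STOP_WORDS = {"a", "an", "the", "is", "are", "on", "in", "of", "and", "with"}

def mask_caption(tokens, mask_rate=0.15, mask_token="[MASK]", seed=0):
    """Mask at least one token per caption, sampling content words first."""
    rng = random.Random(seed)
    content = [i for i, t in enumerate(tokens)
               if t.lower() not in STOP_WORDS and t not in string.punctuation]
    pool = content or list(range(len(tokens)))   # fall back if all stop-words
    k = max(1, round(mask_rate * len(tokens)))   # never leave a caption unmasked
    chosen = set(rng.sample(pool, min(k, len(pool))))
    return [mask_token if i in chosen else t for i, t in enumerate(tokens)]

if __name__ == "__main__":
    print(mask_caption("a man riding a horse on the beach".split()))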

arXiv:2103.09591 (https://arxiv.org/abs/2103.09591) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.CV (Computer Vision and Pattern Recognition)
Title: Automatic Generation of Contrast Sets from Scene Graphs: Probing the Compositional Consistency of GQA
Authors: Yonatan Bitton, Gabriel Stanovsky, Roy Schwartz, Michael Elhadad
Abstract: Recent works have shown that supervised models often exploit data artifacts to achieve good test scores while their performance severely degrades on samples outside their training distribution. Contrast sets (Gardner et al., 2020) quantify this phenomenon by perturbing test samples in a minimal way such that the output label is modified. While most contrast sets were created manually, requiring intensive annotation effort, we present a novel method which leverages rich semantic input representation to automatically generate contrast sets for the visual question answering task. Our method computes the answer of perturbed questions, thus vastly reducing annotation cost and enabling thorough evaluation of models' performance on various semantic aspects (e.g., spatial or relational reasoning). We demonstrate the effectiveness of our approach on the GQA dataset and its semantic scene graph image representation. We find that, despite GQA's compositionality and carefully balanced label distribution, two high-performing models drop 13-17% in accuracy compared to the original test set. Finally, we show that our automatic perturbation can be applied to the training set to mitigate the degradation in performance, opening the door to more robust models.
Submitted 17 March, 2021; originally announced March 2021.
Comments: Accepted to NAACL 2021
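
To make the idea concrete, here is a toy illustration of scene-graph-driven perturbation for a single question template: swap an attribute mentioned in a yes/no question and recompute both answers directly from the scene graph. The graph format, the template, and the helper names are made up for illustration; the paper's method covers many more question types.

TOY_SCENE_GRAPH = {
    "objects": [{"name": "car", "attributes": ["red"]},
                {"name": "dog", "attributes": ["small", "brown"]}],
}

def has_attribute(graph, obj_name, attribute):
    return any(o["name"] == obj_name and attribute in o["attributes"]
               for o in graph["objects"])

def perturb_color_question(graph, obj_name, orig_color, new_color):
    """Return (original_qa, contrast_qa) for 'Is the <obj> <color>?' questions,
    with answers computed from the scene graph rather than human annotation."""
    original = (f"Is the {obj_name} {orig_color}?",
                "yes" if has_attribute(graph, obj_name, orig_color) else "no")
    contrast = (f"Is the {obj_name} {new_color}?",
                "yes" if has_attribute(graph, obj_name, new_color) else "no")
    return original, contrast

if __name__ == "__main__":
    print(perturb_color_question(TOY_SCENE_GRAPH, "car", "red", "blue"))
    # (('Is the car red?', 'yes'), ('Is the car blue?', 'no'))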

arXiv:1212.3540 (https://arxiv.org/abs/1212.3540) [pdf]
Subjects: cs.SI (Social and Information Networks); cs.HC (Human-Computer Interaction); cs.IR (Information Retrieval); physics.soc-ph (Physics and Society)
Title: Social Network Based Search for Experts
Authors: Yehonatan Bitton, Michael Fire, Dima Kagan, Bracha Shapira, Lior Rokach, Judit Bar-Ilan
Abstract: Our system illustrates how information retrieved from social networks can be used to suggest experts for specific tasks. The system is designed to facilitate the task of finding the appropriate person(s) for a job, such as a conference committee member or an advisor. This short description demonstrates how the system works in the context of the HCIR 2012 published tasks.
Submitted 14 December, 2012; originally announced December 2012.
Comments: Participated in HCIR 2012
