Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 127 results for author: <span class="mathjax">Snoek, C G M</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Snoek%2C+C+G+M">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Snoek, C G M"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Snoek%2C+C+G+M&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Snoek, C G M"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Snoek%2C+C+G+M&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Snoek%2C+C+G+M&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Snoek%2C+C+G+M&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Snoek%2C+C+G+M&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12195">arXiv:2502.12195</a> <span> [<a href="https://arxiv.org/pdf/2502.12195">pdf</a>, <a href="https://arxiv.org/format/2502.12195">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> GeneralizeFormer: Layer-Adaptive Model Generation across Test-Time Distribution Shifts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ambekar%2C+S">Sameer Ambekar</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Z">Zehao Xiao</a>, <a href="/search/cs?searchtype=author&query=Zhen%2C+X">Xiantong Zhen</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M. Snoek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12195v1-abstract-short" style="display: inline;"> We consider the problem of test-time domain generalization, where a model is trained on several source domains and adjusted on target domains never seen during training. Different from the common methods that fine-tune the model or adjust the classifier parameters online, we propose to generate multiple layer parameters on the fly during inference by a lightweight meta-learned transformer, which w… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12195v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12195v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12195v1-abstract-full" style="display: none;"> We consider the problem of test-time domain generalization, where a model is trained on several source domains and adjusted on target domains never seen during training. 
2. arXiv:2502.02338 [pdf, other] — cs.CV, cs.LG
Geometric Neural Process Fields
Authors: Wenzhe Yin, Zehao Xiao, Jiayi Shen, Yunlu Chen, Cees G. M. Snoek, Jan-Jakob Sonke, Efstratios Gavves
Abstract: This paper addresses the challenge of Neural Field (NeF) generalization, where models must efficiently adapt to new signals given only a few observations. To tackle this, we propose Geometric Neural Process Fields (G-NPF), a probabilistic framework for neural radiance fields that explicitly captures uncertainty. We formulate NeF generalization as a probabilistic problem, enabling direct inference of NeF function distributions from limited context observations. To incorporate structural inductive biases, we introduce a set of geometric bases that encode spatial structure and facilitate the inference of NeF function distributions. Building on these bases, we design a hierarchical latent variable model, allowing G-NPF to integrate structural information across multiple spatial levels and effectively parameterize INR functions. This hierarchical approach improves generalization to novel scenes and unseen signals. Experiments on novel-view synthesis for 3D scenes, as well as 2D image and 1D signal regression, demonstrate the effectiveness of our method in capturing uncertainty and leveraging structural information for improved generalization.
Submitted 4 February, 2025; originally announced February 2025.
3. arXiv:2501.16404 [pdf, other] — cs.LG, cs.AI, cs.CL
DynaPrompt: Dynamic Test-Time Prompt Tuning
Authors: Zehao Xiao, Shilin Yan, Jack Hong, Jiayin Cai, Xiaolong Jiang, Yao Hu, Jiayi Shen, Qi Wang, Cees G. M. Snoek
Abstract: Test-time prompt tuning enhances zero-shot generalization of vision-language models but tends to ignore the relatedness among test samples during inference. Online test-time prompt tuning provides a simple way to leverage the information in previous test samples, albeit with the risk of prompt collapse due to error accumulation. To enhance test-time prompt tuning, we propose DynaPrompt, short for dynamic test-time prompt tuning, exploiting relevant data distribution information while reducing error accumulation. Built on an online prompt buffer, DynaPrompt adaptively selects and optimizes the relevant prompts for each test sample during tuning. Specifically, we introduce a dynamic prompt selection strategy based on two metrics: prediction entropy and probability difference. For unseen test data information, we develop dynamic prompt appending, which allows the buffer to append new prompts and delete the inactive ones. By doing so, the prompts are optimized to exploit beneficial information on specific test data, while alleviating error accumulation. Experiments on fourteen datasets demonstrate the effectiveness of dynamic test-time prompt tuning.
Submitted 27 January, 2025; originally announced January 2025.
Comments: ICLR 2025
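The two selection metrics named in the abstract (prediction entropy and probability difference) are easy to make concrete. Below is a hedged sketch of buffer-based prompt selection; the thresholds and exact metric definitions are our assumptions, not the paper's formulas.

```python
import torch

def select_prompts(buffer_logits: torch.Tensor,
                   entropy_thresh: float = 2.0,
                   margin_thresh: float = 0.1):
    """buffer_logits: (n_prompts, n_classes) predictions of each buffered prompt
    on the current test sample. Returns indices of prompts worth optimizing,
    or None to signal that a fresh prompt should be appended to the buffer."""
    probs = buffer_logits.softmax(dim=-1)
    entropy = -(probs * probs.clamp_min(1e-12).log()).sum(dim=-1)
    top2 = probs.topk(2, dim=-1).values
    margin = top2[:, 0] - top2[:, 1]              # probability difference
    keep = (entropy < entropy_thresh) & (margin > margin_thresh)
    if not keep.any():
        return None                               # append a new prompt instead
    return keep.nonzero(as_tuple=True)[0]
```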
4. arXiv:2501.05069 [pdf, other] — cs.CV, cs.AI
Commonsense Video Question Answering through Video-Grounded Entailment Tree Reasoning
Authors: Huabin Liu, Filip Ilievski, Cees G. M. Snoek
Abstract: This paper proposes the first video-grounded entailment tree reasoning method for commonsense video question answering (VQA). Despite the remarkable progress of large visual-language models (VLMs), there are growing concerns that they learn spurious correlations between videos and likely answers, reinforced by their black-box nature and remaining benchmarking biases. Our method explicitly grounds VQA tasks to video fragments in four steps: entailment tree construction, video-language entailment verification, tree reasoning, and dynamic tree expansion. A vital benefit of the method is its generalizability to current video and image-based VLMs across reasoning types. To support fair evaluation, we devise a de-biasing procedure based on large-language models that rewrites VQA benchmark answer sets to enforce model reasoning. Systematic experiments on existing and de-biased benchmarks highlight the impact of our method components across benchmarks, VLMs, and reasoning types.
Submitted 9 January, 2025; originally announced January 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.05069">arXiv:2501.05069</a> <span> [<a href="https://arxiv.org/pdf/2501.05069">pdf</a>, <a href="https://arxiv.org/format/2501.05069">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Commonsense Video Question Answering through Video-Grounded Entailment Tree Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huabin Liu</a>, <a href="/search/cs?searchtype=author&query=Ilievski%2C+F">Filip Ilievski</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M. Snoek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.05069v1-abstract-short" style="display: inline;"> This paper proposes the first video-grounded entailment tree reasoning method for commonsense video question answering (VQA). Despite the remarkable progress of large visual-language models (VLMs), there are growing concerns that they learn spurious correlations between videos and likely answers, reinforced by their black-box nature and remaining benchmarking biases. Our method explicitly grounds… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05069v1-abstract-full').style.display = 'inline'; document.getElementById('2501.05069v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.05069v1-abstract-full" style="display: none;"> This paper proposes the first video-grounded entailment tree reasoning method for commonsense video question answering (VQA). Despite the remarkable progress of large visual-language models (VLMs), there are growing concerns that they learn spurious correlations between videos and likely answers, reinforced by their black-box nature and remaining benchmarking biases. Our method explicitly grounds VQA tasks to video fragments in four steps: entailment tree construction, video-language entailment verification, tree reasoning, and dynamic tree expansion. A vital benefit of the method is its generalizability to current video and image-based VLMs across reasoning types. To support fair evaluation, we devise a de-biasing procedure based on large-language models that rewrites VQA benchmark answer sets to enforce model reasoning. Systematic experiments on existing and de-biased benchmarks highlight the impact of our method components across benchmarks, VLMs, and reasoning types. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05069v1-abstract-full').style.display = 'none'; document.getElementById('2501.05069v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11148">arXiv:2412.11148</a> <span> [<a href="https://arxiv.org/pdf/2412.11148">pdf</a>, <a href="https://arxiv.org/format/2412.11148">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Redefining Normal: A Novel Object-Level Approach for Multi-Object Novelty Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Salehi%2C+M">Mohammadreza Salehi</a>, <a href="/search/cs?searchtype=author&query=Apostolikas%2C+N">Nikolaos Apostolikas</a>, <a href="/search/cs?searchtype=author&query=Gavves%2C+E">Efstratios Gavves</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M. Snoek</a>, <a href="/search/cs?searchtype=author&query=Asano%2C+Y+M">Yuki M. Asano</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11148v1-abstract-short" style="display: inline;"> In the realm of novelty detection, accurately identifying outliers in data without specific class information poses a significant challenge. While current methods excel in single-object scenarios, they struggle with multi-object situations due to their focus on individual objects. Our paper suggests a novel approach: redefining `normal' at the object level in training datasets. Rather than the usu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11148v1-abstract-full').style.display = 'inline'; document.getElementById('2412.11148v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11148v1-abstract-full" style="display: none;"> In the realm of novelty detection, accurately identifying outliers in data without specific class information poses a significant challenge. While current methods excel in single-object scenarios, they struggle with multi-object situations due to their focus on individual objects. Our paper suggests a novel approach: redefining `normal' at the object level in training datasets. Rather than the usual image-level view, we consider the most dominant object in a dataset as the norm, offering a perspective that is more effective for real-world scenarios. Adapting to our object-level definition of `normal', we modify knowledge distillation frameworks, where a student network learns from a pre-trained teacher network. Our first contribution, DeFeND(Dense Feature Fine-tuning on Normal Data), integrates dense feature fine-tuning into the distillation process, allowing the teacher network to focus on object-level features with a self-supervised loss. 
7. arXiv:2411.11222 [pdf, other] — cs.CV, cs.MM, cs.SD, eess.AS
The Sound of Water: Inferring Physical Properties from Pouring Liquids
Authors: Piyush Bagad, Makarand Tapaswi, Cees G. M. Snoek, Andrew Zisserman
Abstract: We study the connection between audio-visual observations and the underlying physics of a mundane yet intriguing everyday activity: pouring liquids. Given only the sound of liquid pouring into a container, our objective is to automatically infer physical properties such as the liquid level, the shape and size of the container, the pouring rate and the time to fill. To this end, we: (i) show in theory that these properties can be determined from the fundamental frequency (pitch); (ii) train a pitch detection model with supervision from simulated data and visual data with a physics-inspired objective; (iii) introduce a new large dataset of real pouring videos for a systematic study; (iv) show that the trained model can indeed infer these physical properties for real data; and finally, (v) we demonstrate strong generalization to various container shapes, other datasets, and in-the-wild YouTube videos. Our work presents a keen understanding of a narrow yet rich problem at the intersection of acoustics, physics, and learning. It opens up applications to enhance multisensory perception in robotic pouring.
Submitted 13 January, 2025; v1 submitted 17 November, 2024; originally announced November 2024.
Comments: Project page at https://bpiyush.github.io/pouring-water-website. Short version accepted to ICASSP 2025
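The physics behind point (i) is plausibly the textbook quarter-wavelength resonance: the air column above the liquid acts as a pipe open at the top and closed at the liquid surface, so the fundamental frequency is f0 ≈ c / (4L). A back-of-the-envelope sketch under that assumption (the paper's exact model may differ):

```python
SPEED_OF_SOUND = 343.0  # m/s in air at ~20 °C

def air_column_length(f0_hz: float) -> float:
    """Height (m) of the air column above the liquid for fundamental pitch f0."""
    return SPEED_OF_SOUND / (4.0 * f0_hz)

def fill_fraction(f0_hz: float, container_height_m: float) -> float:
    """Fraction of the container filled, inferred from the current pitch."""
    return 1.0 - air_column_length(f0_hz) / container_height_m

# As the container fills, the air column shortens and the pitch rises:
# fill_fraction(430.0, 0.25)  -> ~0.20 for a 25 cm container
# fill_fraction(1715.0, 0.25) -> ~0.80
```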
8. arXiv:2411.04679 [pdf, other] — cs.AI, cs.CV, cs.MA
CaPo: Cooperative Plan Optimization for Efficient Embodied Multi-Agent Cooperation
Authors: Jie Liu, Pan Zhou, Yingjun Du, Ah-Hwee Tan, Cees G. M. Snoek, Jan-Jakob Sonke, Efstratios Gavves
Abstract: In this work, we address the cooperation problem among large language model (LLM) based embodied agents, where agents must cooperate to achieve a common goal. Previous methods often execute actions extemporaneously and incoherently, without long-term strategic and cooperative planning, leading to redundant steps, failures, and even serious repercussions in complex tasks like search-and-rescue missions, where discussion and a cooperative plan are crucial. To solve this issue, we propose Cooperative Plan Optimization (CaPo) to enhance the cooperation efficiency of LLM-based embodied agents. Inspired by human cooperation schemes, CaPo improves cooperation efficiency with two phases: 1) meta-plan generation, and 2) progress-adaptive meta-plan and execution. In the first phase, all agents analyze the task, discuss, and cooperatively create a meta-plan that decomposes the task into subtasks with detailed steps, ensuring a long-term strategic and coherent plan for efficient coordination. In the second phase, agents execute tasks according to the meta-plan and dynamically adjust it based on their latest progress (e.g., discovering a target object) through multi-turn discussions. This progress-based adaptation eliminates redundant actions, improving the overall cooperation efficiency of agents. Experimental results on the ThreeDworld Multi-Agent Transport and Communicative Watch-And-Help tasks demonstrate that CaPo achieves a much higher task completion rate and efficiency compared with the state of the art.
Submitted 7 November, 2024; originally announced November 2024.
Comments: Under review
9. arXiv:2411.03687 [pdf, other] — cs.LG, cs.AI
Beyond Model Adaptation at Test Time: A Survey
Authors: Zehao Xiao, Cees G. M. Snoek
Abstract: Machine learning algorithms have achieved remarkable success across various disciplines, use cases and applications, under the prevailing assumption that training and test samples are drawn from the same distribution. Consequently, these algorithms struggle and become brittle even when samples in the test distribution start to deviate from the ones observed during training. Domain adaptation and domain generalization have been studied extensively as approaches to address distribution shifts across test and train domains, but each has its limitations. Test-time adaptation, a recently emerging learning paradigm, combines the benefits of domain adaptation and domain generalization by training models only on source data and adapting them to target data during test-time inference. In this survey, we provide a comprehensive and systematic review on test-time adaptation, covering more than 400 recent papers. We structure our review by categorizing existing methods into five distinct categories based on what component of the method is adjusted for test-time adaptation: the model, the inference, the normalization, the sample, or the prompt, providing detailed analysis of each. We further discuss the various preparation and adaptation settings for methods within these categories, offering deeper insights into the effective deployment for the evaluation of distribution shifts and their real-world application in understanding images, video and 3D, as well as modalities beyond vision. We close the survey with an outlook on emerging research opportunities for test-time adaptation.
Submitted 6 November, 2024; originally announced November 2024.
10. arXiv:2410.20164 [pdf, other] — cs.LG, cs.CV
Prompt Diffusion Robustifies Any-Modality Prompt Learning
Authors: Yingjun Du, Gaowen Liu, Yuzhang Shang, Yuguang Yao, Ramana Kompella, Cees G. M. Snoek
Abstract: Foundation models enable prompt-based classifiers for zero-shot and few-shot learning. Nonetheless, the conventional method of employing fixed prompts suffers from distributional shifts that negatively impact generalizability to unseen samples. This paper introduces prompt diffusion, which uses a diffusion model to gradually refine the prompts to obtain a customized prompt for each sample. Specifically, we first optimize a collection of prompts to obtain over-fitted prompts per sample. Then, we propose a prompt diffusion model within the prompt space, enabling the training of a generative transition process from a random prompt to its overfitted prompt. As we cannot access the label of a test image during inference, our model gradually generates customized prompts solely from random prompts using our trained prompt diffusion model. Our prompt diffusion is generic, flexible, and modality-agnostic, making it a simple plug-and-play module seamlessly embedded into existing prompt learning methods for textual, visual, or multi-modal prompt learning. Our diffusion model uses a fast ODE-based sampling strategy to optimize test sample prompts in just five steps, offering a good trade-off between performance improvement and computational efficiency. For all prompt learning methods tested, adding prompt diffusion yields more robust results for base-to-new generalization, cross-dataset generalization, and domain generalization in classification tasks tested over 15 diverse datasets.
Submitted 26 October, 2024; originally announced October 2024.
Comments: Under review
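The five-step ODE-based sampler mentioned in the abstract can be pictured as a deterministic, DDIM-style walk from a random prompt toward a denoised one. A sketch under a simple linear noise schedule; the denoiser, schedule, and update rule are our assumptions:

```python
import torch

@torch.no_grad()
def sample_prompt(denoiser, prompt_dim: int, steps: int = 5) -> torch.Tensor:
    """Generate a customized prompt from pure noise in a few deterministic steps.
    `denoiser(p, t)` is assumed to predict the clean prompt at noise level t."""
    p = torch.randn(1, prompt_dim)                # random initial prompt
    ts = torch.linspace(1.0, 0.0, steps + 1)
    for t, t_next in zip(ts[:-1], ts[1:]):
        p0_hat = denoiser(p, t)                   # predicted clean prompt
        p = p0_hat + t_next * (p - p0_hat) / t    # shrink noise from level t to t_next
    return p
```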
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03687">arXiv:2411.03687</a> <span> [<a href="https://arxiv.org/pdf/2411.03687">pdf</a>, <a href="https://arxiv.org/format/2411.03687">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Beyond Model Adaptation at Test Time: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+Z">Zehao Xiao</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M. Snoek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03687v1-abstract-short" style="display: inline;"> Machine learning algorithms have achieved remarkable success across various disciplines, use cases and applications, under the prevailing assumption that training and test samples are drawn from the same distribution. Consequently, these algorithms struggle and become brittle even when samples in the test distribution start to deviate from the ones observed during training. Domain adaptation and d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03687v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03687v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03687v1-abstract-full" style="display: none;"> Machine learning algorithms have achieved remarkable success across various disciplines, use cases and applications, under the prevailing assumption that training and test samples are drawn from the same distribution. Consequently, these algorithms struggle and become brittle even when samples in the test distribution start to deviate from the ones observed during training. Domain adaptation and domain generalization have been studied extensively as approaches to address distribution shifts across test and train domains, but each has its limitations. Test-time adaptation, a recently emerging learning paradigm, combines the benefits of domain adaptation and domain generalization by training models only on source data and adapting them to target data during test-time inference. In this survey, we provide a comprehensive and systematic review on test-time adaptation, covering more than 400 recent papers. We structure our review by categorizing existing methods into five distinct categories based on what component of the method is adjusted for test-time adaptation: the model, the inference, the normalization, the sample, or the prompt, providing detailed analysis of each. We further discuss the various preparation and adaptation settings for methods within these categories, offering deeper insights into the effective deployment for the evaluation of distribution shifts and their real-world application in understanding images, video and 3D, as well as modalities beyond vision. 
12. arXiv:2410.12407 [pdf, other] — cs.CV, cs.CL, cs.MM
Beyond Coarse-Grained Matching in Video-Text Retrieval
Authors: Aozhu Chen, Hazel Doughty, Xirong Li, Cees G. M. Snoek
Abstract: Video-text retrieval has seen significant advancements, yet the ability of models to discern subtle differences in captions still requires verification. In this paper, we introduce a new approach for fine-grained evaluation. Our approach can be applied to existing datasets by automatically generating hard negative test captions with subtle single-word variations across nouns, verbs, adjectives, adverbs, and prepositions. We perform comprehensive experiments using four state-of-the-art models across two standard benchmarks (MSR-VTT and VATEX) and two specially curated datasets enriched with detailed descriptions (VLN-UVO and VLN-OOPS), resulting in a number of novel insights: 1) our analyses show that the current evaluation benchmarks fall short in detecting a model's ability to perceive subtle single-word differences, 2) our fine-grained evaluation highlights the difficulty models face in distinguishing such subtle variations. To enhance fine-grained understanding, we propose a new baseline that can be easily combined with current methods. Experiments on our fine-grained evaluations demonstrate that this approach enhances a model's ability to understand fine-grained differences.
Submitted 17 October, 2024; v1 submitted 16 October, 2024; originally announced October 2024.
Comments: Accepted to ACCV 2024
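To make the evaluation idea concrete, hard negatives with subtle single-word variations can be produced by swapping one word for another of the same part of speech. A toy illustration; a real pipeline would rely on POS tagging and curated substitution lists rather than this tiny hand-written table:

```python
import random

SWAPS = {"man": "woman", "runs": "walks", "red": "blue", "into": "out of"}

def hard_negative(caption: str) -> str:
    """Return the caption with exactly one swappable word replaced."""
    words = caption.split()
    candidates = [i for i, w in enumerate(words) if w.lower() in SWAPS]
    if not candidates:
        return caption                       # nothing swappable; keep as-is
    i = random.choice(candidates)
    words[i] = SWAPS[words[i].lower()]
    return " ".join(words)

# hard_negative("a man runs into the red house")
# -> e.g. "a man walks into the red house" (single-verb variation)
```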
13. arXiv:2410.12018 [pdf, other] — cs.CV, cs.CL, cs.MM
LocoMotion: Learning Motion-Focused Video-Language Representations
Authors: Hazel Doughty, Fida Mohammad Thoker, Cees G. M. Snoek
Abstract: This paper strives for motion-focused video-language representations. Existing methods to learn video-language representations use spatial-focused data, where identifying the objects and scene is often enough to distinguish the relevant caption. We instead propose LocoMotion to learn from motion-focused captions that describe the movement and temporal progression of local object motions. We achieve this by adding synthetic motions to videos and using the parameters of these motions to generate corresponding captions. Furthermore, we propose verb-variation paraphrasing to increase the caption variety and learn the link between primitive motions and high-level verbs. With this, we are able to learn a motion-focused video-language representation. Experiments demonstrate our approach is effective for a variety of downstream tasks, particularly when limited data is available for fine-tuning. Code is available: https://hazeldoughty.github.io/Papers/LocoMotion/
Submitted 23 October, 2024; v1 submitted 15 October, 2024; originally announced October 2024.
Comments: ACCV 2024 Oral
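Pairing synthetic motion parameters with generated captions might look like simple templating plus the verb-variation paraphrasing the abstract mentions. A toy sketch with invented parameter names and templates:

```python
import random

LEFT_VERBS = ["slides left", "moves leftward", "drifts to the left"]

def motion_caption(obj: str, direction: str, speed: float) -> str:
    """Caption a synthetic motion from its parameters (illustrative only)."""
    verb = random.choice(LEFT_VERBS) if direction == "left" else f"moves {direction}"
    pace = "quickly" if speed > 1.0 else "slowly"
    return f"the {obj} {verb} {pace}"        # verb variation adds caption variety

# motion_caption("cup", "left", 1.5) -> e.g. "the cup slides left quickly"
```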
Nonetheless, their performance heavily depends on the specificity of the input text prompts, which requires skillful prompt template engineering. Instead, current approaches to prompt optimization learn the prompts through gradient descent, where the prompts are treated as adjustable parameters. However, these methods tend to lead to overfitting of the base classes seen during training and produce prompts that are no longer understandable by humans. This paper introduces a simple but interpretable prompt optimizer (IPO), that utilizes large language models (LLMs) to generate textual prompts dynamically. We introduce a Prompt Optimization Prompt that not only guides LLMs in creating effective prompts but also stores past prompts with their performance metrics, providing rich in-context information. Additionally, we incorporate a large multimodal model (LMM) to condition on visual content by generating image descriptions, which enhance the interaction between textual and visual modalities. This allows for thae creation of dataset-specific prompts that improve generalization performance, while maintaining human comprehension. Extensive testing across 11 datasets reveals that IPO not only improves the accuracy of existing gradient-descent-based prompt learning methods but also considerably enhances the interpretability of the generated prompts. By leveraging the strengths of LLMs, our approach ensures that the prompts remain human-understandable, thereby facilitating better transparency and oversight for vision-language models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15397v1-abstract-full').style.display = 'none'; document.getElementById('2410.15397v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12407">arXiv:2410.12407</a> <span> [<a href="https://arxiv.org/pdf/2410.12407">pdf</a>, <a href="https://arxiv.org/format/2410.12407">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Beyond Coarse-Grained Matching in Video-Text Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+A">Aozhu Chen</a>, <a href="/search/cs?searchtype=author&query=Doughty%2C+H">Hazel Doughty</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xirong Li</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M. 
Snoek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12407v2-abstract-short" style="display: inline;"> Video-text retrieval has seen significant advancements, yet the ability of models to discern subtle differences in captions still requires verification. In this paper, we introduce a new approach for fine-grained evaluation. Our approach can be applied to existing datasets by automatically generating hard negative test captions with subtle single-word variations across nouns, verbs, adjectives, ad… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12407v2-abstract-full').style.display = 'inline'; document.getElementById('2410.12407v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12407v2-abstract-full" style="display: none;"> Video-text retrieval has seen significant advancements, yet the ability of models to discern subtle differences in captions still requires verification. In this paper, we introduce a new approach for fine-grained evaluation. Our approach can be applied to existing datasets by automatically generating hard negative test captions with subtle single-word variations across nouns, verbs, adjectives, adverbs, and prepositions. We perform comprehensive experiments using four state-of-the-art models across two standard benchmarks (MSR-VTT and VATEX) and two specially curated datasets enriched with detailed descriptions (VLN-UVO and VLN-OOPS), resulting in a number of novel insights: 1) our analyses show that the current evaluation benchmarks fall short in detecting a model's ability to perceive subtle single-word differences, 2) our fine-grained evaluation highlights the difficulty models face in distinguishing such subtle variations. To enhance fine-grained understanding, we propose a new baseline that can be easily combined with current methods. Experiments on our fine-grained evaluations demonstrate that this approach enhances a model's ability to understand fine-grained differences. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12407v2-abstract-full').style.display = 'none'; document.getElementById('2410.12407v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 

arXiv:2410.12018 (https://arxiv.org/abs/2410.12018) [cs.CV, cs.CL, cs.MM]
LocoMotion: Learning Motion-Focused Video-Language Representations
Authors: Hazel Doughty, Fida Mohammad Thoker, Cees G. M. Snoek
Abstract: This paper strives for motion-focused video-language representations. Existing methods to learn video-language representations use spatial-focused data, where identifying the objects and scene is often enough to distinguish the relevant caption. We instead propose LocoMotion to learn from motion-focused captions that describe the movement and temporal progression of local object motions. We achieve this by adding synthetic motions to videos and using the parameters of these motions to generate corresponding captions. Furthermore, we propose verb-variation paraphrasing to increase the caption variety and learn the link between primitive motions and high-level verbs. With this, we are able to learn a motion-focused video-language representation. Experiments demonstrate our approach is effective for a variety of downstream tasks, particularly when limited data is available for fine-tuning.
Code is available: https://hazeldoughty.github.io/Papers/LocoMotion/
Submitted 23 October, 2024; v1 submitted 15 October, 2024; originally announced October 2024.
Comments: ACCV 2024 Oral
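
By way of illustration only, here is one way to pair a parameterized synthetic motion with a caption generated from its parameters, in the spirit of the LocoMotion abstract; the motion model (a translating square) and the caption template are assumptions made for this sketch:

import numpy as np

def add_synthetic_motion(frames, dx=4, dy=0, size=8):
    """Paint a square that translates by (dx, dy) per frame; return a caption.

    frames: float array of shape (T, H, W), modified in place.
    """
    T, H, W = frames.shape
    x0, y0 = 4, H // 2
    for t in range(T):
        x = min(x0 + t * dx, W - size)
        y = min(y0 + t * dy, H - size)
        frames[t, y:y + size, x:x + size] = 1.0
    direction = "right" if dx >= 0 else "left"
    speed = "quickly" if abs(dx) + abs(dy) > 3 else "slowly"
    return f"a square moves {speed} to the {direction}"

video = np.zeros((8, 64, 64), dtype=np.float32)
caption = add_synthetic_motion(video)  # "a square moves quickly to the right"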

arXiv:2410.10491 (https://arxiv.org/abs/2410.10491) [cs.CV]
Learning to Ground VLMs without Forgetting
Authors: Aritra Bhowmik, Mohammad Mahdi Derakhshani, Dennis Koelma, Martin R. Oswald, Yuki M. Asano, Cees G. M. Snoek
Abstract: Spatial awareness is key to enabling embodied multimodal AI systems. Yet, without vast amounts of spatial supervision, current Visual Language Models (VLMs) struggle at this task. In this paper, we introduce LynX, a framework that equips pretrained VLMs with visual grounding ability without forgetting their existing image and language understanding skills. To this end, we propose a Dual Mixture of Experts module that modifies only the decoder layer of the language model, using one frozen Mixture of Experts (MoE) pre-trained on image and language understanding and another learnable MoE for new grounding capabilities. This allows the VLM to retain previously learned knowledge and skills, while acquiring what is missing. To train the model effectively, we generate a high-quality synthetic dataset we call SCouT, which mimics human reasoning in visual grounding. This dataset provides rich supervision signals, describing a step-by-step multimodal reasoning process, thereby simplifying the task of visual grounding. We evaluate LynX on several object detection and visual grounding datasets, demonstrating strong performance in object detection, zero-shot localization and grounded reasoning while maintaining its original image and language understanding capabilities on seven standard benchmark datasets.
Submitted 14 October, 2024; originally announced October 2024.
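
A loose sketch of the "frozen expert plus learnable expert" idea from the LynX abstract, using plain MLPs in place of full MoE blocks; the gating scheme and dimensions are assumptions, and the paper's actual module will differ:

import torch
import torch.nn as nn

class DualExpertLayer(nn.Module):
    def __init__(self, dim=512):
        super().__init__()
        self.frozen = nn.Sequential(nn.Linear(dim, dim), nn.GELU(), nn.Linear(dim, dim))
        for p in self.frozen.parameters():
            p.requires_grad = False          # preserves pretrained behaviour
        self.grounding = nn.Sequential(nn.Linear(dim, dim), nn.GELU(), nn.Linear(dim, dim))
        self.gate = nn.Linear(dim, 2)        # mixes old and new experts per token

    def forward(self, x):                    # x: (batch, tokens, dim)
        w = self.gate(x).softmax(dim=-1)
        return w[..., :1] * self.frozen(x) + w[..., 1:] * self.grounding(x)

out = DualExpertLayer()(torch.randn(2, 16, 512))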

arXiv:2410.10034 (https://arxiv.org/abs/2410.10034) [cs.CV]
TULIP: Token-length Upgraded CLIP
Authors: Ivona Najdenkoska, Mohammad Mahdi Derakhshani, Yuki M. Asano, Nanne van Noord, Marcel Worring, Cees G. M. Snoek
Abstract: We address the challenge of representing long captions in vision-language models, such as CLIP. By design these models are limited by fixed, absolute positional encodings, restricting inputs to a maximum of 77 tokens and hindering performance on tasks requiring longer descriptions. Although recent work has attempted to overcome this limit, their proposed approaches struggle to model token relationships over longer distances and simply extend to a new, still fixed, token length. Instead, we propose a generalizable method, named TULIP, able to upgrade the token length to any length for CLIP-like models. We do so by improving the architecture with relative position encodings, followed by a training procedure that (i) distills the original CLIP text encoder into an encoder with relative position encodings and (ii) enhances the model for aligning longer captions with images. By effectively encoding captions longer than the default 77 tokens, our model outperforms baselines on cross-modal tasks such as retrieval and text-to-image generation.
Submitted 13 October, 2024; originally announced October 2024.
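
A minimal illustration of why relative position encodings decouple a text encoder from a fixed maximum length: the bias depends only on the clipped token-to-token distance, so it applies to sequences of any length. This is a generic relative-bias scheme, not TULIP's exact design:

import torch
import torch.nn as nn

class RelativeBias(nn.Module):
    def __init__(self, max_dist=32, heads=8):
        super().__init__()
        self.max_dist = max_dist
        self.bias = nn.Embedding(2 * max_dist + 1, heads)

    def forward(self, seq_len):
        pos = torch.arange(seq_len)
        rel = (pos[None, :] - pos[:, None]).clamp(-self.max_dist, self.max_dist)
        return self.bias(rel + self.max_dist).permute(2, 0, 1)  # (heads, L, L)

# Works for 200 tokens even if training never saw sequences that long:
attention_bias = RelativeBias()(200)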

arXiv:2410.07752 (https://arxiv.org/abs/2410.07752) [cs.CV]
TVBench: Redesigning Video-Language Evaluation
Authors: Daniel Cores, Michael Dorkenwald, Manuel Mucientes, Cees G. M. Snoek, Yuki M. Asano
Abstract: Large language models have demonstrated impressive performance when integrated with vision models, even enabling video understanding. However, evaluating these video models presents its own unique challenges, for which several benchmarks have been proposed. In this paper, we show that the currently most used video-language benchmarks can be solved without requiring much temporal reasoning. We identify three main issues in existing datasets: (i) static information from single frames is often sufficient to solve the tasks; (ii) the text of the questions and candidate answers is overly informative, allowing models to answer correctly without relying on any visual input; and (iii) world knowledge alone can answer many of the questions, making the benchmarks a test of knowledge replication rather than visual reasoning. In addition, we found that open-ended question-answering benchmarks for video understanding suffer from similar issues, while the automatic evaluation process with LLMs is unreliable, making it an unsuitable alternative. As a solution, we propose TVBench, a novel open-source video multiple-choice question-answering benchmark, and demonstrate through extensive evaluations that it requires a high level of temporal understanding. Surprisingly, we find that most recent state-of-the-art video-language models perform close to random chance on TVBench, with only a few models such as Qwen2-VL and Tarsier clearly surpassing this baseline.
Submitted 3 January, 2025; v1 submitted 10 October, 2024; originally announced October 2024.

arXiv:2408.14371 (https://arxiv.org/abs/2408.14371) [cs.CV, cs.AI, cs.LG]
SelEx: Self-Expertise in Fine-Grained Generalized Category Discovery
Authors: Sarah Rastegar, Mohammadreza Salehi, Yuki M. Asano, Hazel Doughty, Cees G. M. Snoek
Abstract: In this paper, we address Generalized Category Discovery, aiming to simultaneously uncover novel categories and accurately classify known ones. Traditional methods, which lean heavily on self-supervision and contrastive learning, often fall short when distinguishing between fine-grained categories. To address this, we introduce a novel concept called 'self-expertise', which enhances the model's ability to recognize subtle differences and uncover unknown categories. Our approach combines unsupervised and supervised self-expertise strategies to refine the model's discernment and generalization. Initially, hierarchical pseudo-labeling is used to provide 'soft supervision', improving the effectiveness of self-expertise. Our supervised technique differs from traditional methods by utilizing more abstract positive and negative samples, aiding in the formation of clusters that can generalize to novel categories. Meanwhile, our unsupervised strategy encourages the model to sharpen its category distinctions by considering within-category examples as 'hard' negatives. Supported by theoretical insights, our empirical results showcase that our method outperforms existing state-of-the-art techniques in Generalized Category Discovery across several fine-grained datasets. Our code is available at: https://github.com/SarahRastegar/SelEx.
Submitted 10 November, 2024; v1 submitted 26 August, 2024; originally announced August 2024.
Comments: Accepted by ECCV 2024

arXiv:2407.15447 (https://arxiv.org/abs/2407.15447) [cs.CV]
SIGMA: Sinkhorn-Guided Masked Video Modeling
Authors: Mohammadreza Salehi, Michael Dorkenwald, Fida Mohammad Thoker, Efstratios Gavves, Cees G. M. Snoek, Yuki M. Asano
Abstract: Video-based pretraining offers immense potential for learning strong visual representations on an unprecedented scale. Recently, masked video modeling methods have shown promising scalability, yet fall short in capturing higher-level semantics due to reconstructing predefined low-level targets such as pixels. To tackle this, we present Sinkhorn-guided Masked Video Modelling (SIGMA), a novel video pretraining method that jointly learns the video model in addition to a target feature space using a projection network. However, this simple modification means that the regular L2 reconstruction loss will lead to trivial solutions as both networks are jointly optimized. As a solution, we distribute features of space-time tubes evenly across a limited number of learnable clusters. By posing this as an optimal transport problem, we enforce high entropy in the generated features across the batch, infusing semantic and temporal meaning into the feature space. The resulting cluster assignments are used as targets for a symmetric prediction task where the video model predicts cluster assignment of the projection network and vice versa. Experimental results on ten datasets across three benchmarks validate the effectiveness of SIGMA in learning more performant, temporally-aware, and robust video representations, improving upon state-of-the-art methods. Our project website with code is available at: https://quva-lab.github.io/SIGMA.
Submitted 22 July, 2024; originally announced July 2024.
Comments: Accepted at ECCV 24
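
The generic operation the SIGMA abstract builds on is Sinkhorn-Knopp normalization, which turns a score matrix into balanced soft cluster assignments. The sketch below is the standard algorithm, not the paper's full pipeline; the iteration count and temperature are common defaults, not the paper's settings:

import torch

def sinkhorn(scores, n_iters=3, eps=0.05):
    """scores: (batch, clusters) similarity logits -> balanced assignments."""
    q = torch.exp(scores / eps)
    for _ in range(n_iters):
        q = q / q.sum(dim=0, keepdim=True)   # each cluster gets equal total mass
        q = q / q.sum(dim=1, keepdim=True)   # each sample's assignment sums to one
    return q

assignments = sinkhorn(torch.randn(256, 32))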

arXiv:2407.12427 (https://arxiv.org/abs/2407.12427) [cs.CV]
GeneralAD: Anomaly Detection Across Domains by Attending to Distorted Features
Authors: Luc P. J. Sträter, Mohammadreza Salehi, Efstratios Gavves, Cees G. M. Snoek, Yuki M. Asano
Abstract: In the domain of anomaly detection, methods often excel in either high-level semantic or low-level industrial benchmarks, rarely achieving cross-domain proficiency. Semantic anomalies are novelties that differ in meaning from the training set, like unseen objects in self-driving cars. In contrast, industrial anomalies are subtle defects that preserve semantic meaning, such as cracks in airplane components. In this paper, we present GeneralAD, an anomaly detection framework designed to operate in semantic, near-distribution, and industrial settings with minimal per-task adjustments. In our approach, we capitalize on the inherent design of Vision Transformers, which are trained on image patches, thereby ensuring that the last hidden states retain a patch-based structure. We propose a novel self-supervised anomaly generation module that employs straightforward operations like noise addition and shuffling of patch features to construct pseudo-abnormal samples. These features are fed to an attention-based discriminator, which is trained to score every patch in the image. With this, our method can both accurately identify anomalies at the image level and also generate interpretable anomaly maps.
We extensively evaluated our approach on ten datasets, achieving state-of-the-art results in six and on-par performance in the remaining, for both localization and detection tasks.
Submitted 17 July, 2024; originally announced July 2024.
Comments: Accepted at ECCV 2024
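
A toy version of the self-supervised anomaly generation the abstract mentions: distort ViT patch features by shuffling a subset of patches and adding noise. The ratio and noise scale are assumptions for the sketch, not the paper's values:

import torch

def pseudo_abnormal(patch_feats, shuffle_ratio=0.25, noise_std=0.5):
    """patch_feats: (patches, dim) from a frozen ViT; returns a distorted copy."""
    feats = patch_feats.clone()
    n = feats.shape[0]
    idx = torch.randperm(n)[: int(n * shuffle_ratio)]
    feats[idx] = feats[idx[torch.randperm(len(idx))]]       # shuffle some patches
    feats[idx] += noise_std * torch.randn_like(feats[idx])  # and perturb them
    return feats

fake_sample = pseudo_abnormal(torch.randn(196, 768))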

arXiv:2406.09415 (https://arxiv.org/abs/2406.09415) [cs.CV, cs.LG]
An Image is Worth More Than 16x16 Patches: Exploring Transformers on Individual Pixels
Authors: Duy-Kien Nguyen, Mahmoud Assran, Unnat Jain, Martin R. Oswald, Cees G. M. Snoek, Xinlei Chen
Abstract: This work does not introduce a new method. Instead, we present an interesting finding that questions the necessity of locality as an inductive bias in modern computer vision architectures. Concretely, we find that vanilla Transformers can operate by directly treating each individual pixel as a token and achieve highly performant results. This is substantially different from the popular design in Vision Transformer, which maintains the inductive bias from ConvNets towards local neighborhoods (e.g. by treating each 16x16 patch as a token). We mainly showcase the effectiveness of pixels-as-tokens across three well-studied tasks in computer vision: supervised learning for object classification, self-supervised learning via masked autoencoding, and image generation with diffusion models. Although directly operating on individual pixels is less computationally practical, we believe the community must be aware of this surprising piece of knowledge when devising the next generation of neural architectures for computer vision.
Submitted 13 June, 2024; originally announced June 2024.
Comments: Technical report, 23 pages
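
The core data manipulation behind "pixels as tokens" is just a reshape: every pixel becomes its own token instead of a 16x16 patch. Shapes only; the transformer on top is omitted:

import torch

image = torch.randn(3, 64, 64)                   # (C, H, W)
pixel_tokens = image.flatten(1).transpose(0, 1)  # (4096, 3): one token per pixel
patch_tokens = torch.nn.functional.unfold(
    image.unsqueeze(0), kernel_size=16, stride=16
).squeeze(0).transpose(0, 1)                     # (16, 768): ViT-style patch tokens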

arXiv:2404.00701 (https://arxiv.org/abs/2404.00701) [cs.CV]
Training-Free Semantic Segmentation via LLM-Supervision
Authors: Wenfang Sun, Yingjun Du, Gaowen Liu, Ramana Kompella, Cees G. M. Snoek
Abstract: Recent advancements in open vocabulary models, like CLIP, have notably advanced zero-shot classification and segmentation by utilizing natural language for class-specific embeddings. However, most research has focused on improving model accuracy through prompt engineering, prompt learning, or fine-tuning with limited labeled data, thereby overlooking the importance of refining the class descriptors. This paper introduces a new approach to text-supervised semantic segmentation using supervision by a large language model (LLM) that does not require extra training. Our method starts from an LLM, like GPT-3, to generate a detailed set of subclasses for more accurate class representation. We then employ an advanced text-supervised semantic segmentation model to apply the generated subclasses as target labels, resulting in diverse segmentation results tailored to each subclass's unique characteristics. Additionally, we propose an assembly that merges the segmentation maps from the various subclass descriptors to ensure a more comprehensive representation of the different aspects in the test images. Through comprehensive experiments on three standard benchmarks, our method outperforms traditional text-supervised semantic segmentation methods by a marked margin.
Submitted 31 March, 2024; originally announced April 2024.
Comments: 22 pages, 10 figures, conference
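
A hedged sketch of the subclass-ensembling idea in the abstract: ask an LLM for subclasses of each class, segment per subclass, then merge the subclass maps back into one map per class. Both llm_subclasses and segment are hypothetical stand-ins, and the max-merge is one plausible choice, not necessarily the paper's:

import numpy as np

def ensemble_segmentation(image, classes, llm_subclasses, segment):
    maps = {}
    for cls in classes:
        sub = llm_subclasses(cls)            # e.g. "dog" -> ["poodle", "beagle"]
        scores = np.stack([segment(image, s) for s in sub])  # (S, H, W)
        maps[cls] = scores.max(axis=0)       # merge: keep the best subclass score
    return maps

# Demo with dummy stand-ins:
maps = ensemble_segmentation(
    image=np.zeros((32, 32)),
    classes=["dog"],
    llm_subclasses=lambda c: [f"{c} subtype {i}" for i in range(2)],
    segment=lambda img, label: np.random.rand(*img.shape),
)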

arXiv:2403.12143 (https://arxiv.org/abs/2403.12143) [cs.LG, cs.AI, stat.ML]
Graph Neural Networks for Learning Equivariant Representations of Neural Networks
Authors: Miltiadis Kofinas, Boris Knyazev, Yan Zhang, Yunlu Chen, Gertjan J. Burghouts, Efstratios Gavves, Cees G. M. Snoek, David W. Zhang
Abstract: Neural networks that process the parameters of other neural networks find applications in domains as diverse as classifying implicit neural representations, generating neural network weights, and predicting generalization errors. However, existing approaches either overlook the inherent permutation symmetry in the neural network or rely on intricate weight-sharing patterns to achieve equivariance, while ignoring the impact of the network architecture itself. In this work, we propose to represent neural networks as computational graphs of parameters, which allows us to harness powerful graph neural networks and transformers that preserve permutation symmetry. Consequently, our approach enables a single model to encode neural computational graphs with diverse architectures. We showcase the effectiveness of our method on a wide range of tasks, including classification and editing of implicit neural representations, predicting generalization performance, and learning to optimize, while consistently outperforming state-of-the-art methods. The source code is open-sourced at https://github.com/mkofinas/neural-graphs.
Submitted 23 July, 2024; v1 submitted 18 March, 2024; originally announced March 2024.
Comments: In ICLR 2024. Source code: https://github.com/mkofinas/neural-graphs
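
To make the "networks as computational graphs of parameters" idea concrete, here is one way to turn a 2-layer MLP into a graph where neurons become nodes (carrying biases) and weights become edge features. The layout is illustrative, not the paper's exact encoding (see their repository for that):

import torch

def mlp_to_graph(w1, b1, w2, b2):
    """w1: (h, i), w2: (o, h). Returns node features and weighted edges."""
    i, h, o = w1.shape[1], w1.shape[0], w2.shape[0]
    node_bias = torch.cat([torch.zeros(i), b1, b2])  # input nodes carry no bias
    src1, dst1 = torch.meshgrid(torch.arange(i), torch.arange(h), indexing="ij")
    src2, dst2 = torch.meshgrid(torch.arange(h), torch.arange(o), indexing="ij")
    edges = torch.cat([
        torch.stack([src1.flatten(), i + dst1.flatten()]),
        torch.stack([i + src2.flatten(), i + h + dst2.flatten()]),
    ], dim=1)                                        # (2, E) connectivity
    weights = torch.cat([w1.t().flatten(), w2.t().flatten()])  # one scalar per edge
    return node_bias, edges, weights

nodes, edges, weights = mlp_to_graph(torch.randn(16, 8), torch.randn(16),
                                     torch.randn(4, 16), torch.randn(4))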

arXiv:2402.10099 (https://arxiv.org/abs/2402.10099) [cs.CV]
Any-Shift Prompting for Generalization over Distributions
Authors: Zehao Xiao, Jiayi Shen, Mohammad Mahdi Derakhshani, Shengcai Liao, Cees G. M. Snoek
Abstract: Image-language models with prompt learning have shown remarkable advances in numerous downstream vision tasks. Nevertheless, conventional prompt learning methods overfit their training distribution and lose the generalization ability on test distributions. To improve generalization across various distribution shifts, we propose any-shift prompting: a general probabilistic inference framework that considers the relationship between training and test distributions during prompt learning. We explicitly connect training and test distributions in the latent space by constructing training and test prompts in a hierarchical architecture. Within this framework, the test prompt exploits the distribution relationships to guide the generalization of the CLIP image-language model from training to any test distribution. To effectively encode the distribution information and their relationships, we further introduce a transformer inference network with a pseudo-shift training mechanism. The network generates the tailored test prompt with both training and test information in a feedforward pass, avoiding extra training costs at test time. Extensive experiments on twenty-three datasets demonstrate the effectiveness of any-shift prompting on the generalization over various distribution shifts.
Submitted 15 February, 2024; originally announced February 2024.

arXiv:2402.08657 (https://arxiv.org/abs/2402.08657) [cs.CV]
PIN: Positional Insert Unlocks Object Localisation Abilities in VLMs
Authors: Michael Dorkenwald, Nimrod Barazani, Cees G. M. Snoek, Yuki M. Asano
Abstract: Vision-Language Models (VLMs), such as Flamingo and GPT-4V, have shown immense potential by integrating large language models with vision systems. Nevertheless, these models face challenges in the fundamental computer vision task of object localisation, due to their training on multimodal data containing mostly captions without explicit spatial grounding. While it is possible to construct custom, supervised training pipelines with bounding box annotations that integrate with VLMs, these result in specialized and hard-to-scale models. In this paper, we aim to explore the limits of caption-based VLMs and instead propose to tackle the challenge in a simpler manner by i) keeping the weights of a caption-based VLM frozen and ii) not using any supervised detection data. To this end, we introduce an input-agnostic Positional Insert (PIN), a learnable spatial prompt, containing a minimal set of parameters that are slid inside the frozen VLM, unlocking object localisation capabilities. Our PIN module is trained with a simple next-token prediction task on synthetic data without requiring the introduction of new output heads. Our experiments demonstrate strong zero-shot localisation performance on a variety of images, including Pascal VOC, COCO, LVIS, and diverse images like paintings or cartoons.
Submitted 13 February, 2024; originally announced February 2024.
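
A rough rendering of what a learnable "positional insert" could look like: a small set of trainable embeddings added to the frozen VLM's visual tokens, trained while everything else stays frozen. Details such as the grid size and injection point are assumptions, not the paper's configuration:

import torch
import torch.nn as nn

class PositionalInsert(nn.Module):
    def __init__(self, grid=14, dim=768):
        super().__init__()
        # The only trainable parameters; the VLM itself stays frozen.
        self.spatial = nn.Parameter(torch.zeros(grid * grid, dim))

    def forward(self, visual_tokens):        # (batch, grid*grid, dim)
        return visual_tokens + self.spatial  # inject a spatial signal

tokens = torch.randn(2, 196, 768)            # from a frozen vision encoder
augmented = PositionalInsert()(tokens)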

arXiv:2401.04716 (https://arxiv.org/abs/2401.04716) [cs.CV]
Low-Resource Vision Challenges for Foundation Models
Authors: Yunhua Zhang, Hazel Doughty, Cees G. M. Snoek
Abstract: Low-resource settings are well-established in natural language processing, where many languages lack sufficient data for deep learning at scale. However, low-resource problems are under-explored in computer vision. In this paper, we address this gap and explore the challenges of low-resource image tasks with vision foundation models. We first collect a benchmark of genuinely low-resource image data, covering historic maps, circuit diagrams, and mechanical drawings. These low-resource settings all share three challenges: data scarcity, fine-grained differences, and the distribution shift from natural images to the specialized domain of interest. While existing foundation models have shown impressive generalizability, we find they cannot transfer well to our low-resource tasks. To begin to tackle the challenges of low-resource vision, we introduce one simple baseline per challenge. Specifically, we i) enlarge the data space by generative models, ii) adopt the best sub-kernels to encode local regions for fine-grained difference discovery and iii) learn attention for specialized domains.
Experiments on our three low-resource tasks demonstrate our proposals already provide a better baseline than transfer learning, data augmentation, and fine-grained methods. This highlights the unique characteristics and challenges of low-resource vision for foundation models that warrant further investigation. Project page: https://xiaobai1217.github.io/Low-Resource-Vision/.
Submitted 11 April, 2024; v1 submitted 9 January, 2024; originally announced January 2024.
Comments: Accepted at CVPR2024

arXiv:2312.10825 (https://arxiv.org/abs/2312.10825) [cs.CV, cs.LG]
Latent Space Editing in Transformer-Based Flow Matching
Authors: Vincent Tao Hu, David W Zhang, Pascal Mettes, Meng Tang, Deli Zhao, Cees G. M. Snoek
Abstract: This paper strives for image editing via generative models. Flow Matching is an emerging generative modeling technique that offers the advantage of simple and efficient training. Simultaneously, a new transformer-based U-ViT has recently been proposed to replace the commonly used UNet for better scalability and performance in generative modeling. Hence, Flow Matching with a transformer backbone offers the potential for scalable and high-quality generative modeling, but its latent structure and editing ability are as yet unknown. We therefore adopt this setting and explore how to edit images through latent space manipulation. We introduce an editing space, which we call $u$-space, that can be manipulated in a controllable, accumulative, and composable manner. Additionally, we propose a tailored sampling solution to enable sampling with the more efficient adaptive step-size ODE solvers. Lastly, we put forth a straightforward yet powerful method for achieving fine-grained and nuanced editing using text prompts. Our framework is simple and efficient, all while being highly effective at editing images while preserving the essence of the original content. Our code will be publicly available at https://taohu.me/lfm/
Submitted 17 December, 2023; originally announced December 2023.
Comments: AAAI 2024 with Appendix
In this paper, we propose \emph{Motion Flow Matching}, a novel generative model designed for human motion generation featuring efficient sampling and effective… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.08895v1-abstract-full').style.display = 'inline'; document.getElementById('2312.08895v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.08895v1-abstract-full" style="display: none;"> Human motion synthesis is a fundamental task in computer animation. Recent methods based on diffusion models or GPT structures demonstrate commendable performance but exhibit drawbacks in terms of slow sampling speeds and error accumulation. In this paper, we propose \emph{Motion Flow Matching}, a novel generative model designed for human motion generation featuring efficient sampling and effectiveness in motion editing applications. Our method reduces the sampling complexity from a thousand steps in previous diffusion models to just ten steps, while achieving comparable performance in text-to-motion and action-to-motion generation benchmarks. Notably, our approach establishes a new state-of-the-art Fréchet Inception Distance on the KIT-ML dataset. Moreover, we tailor a straightforward motion editing paradigm named \emph{sampling trajectory rewriting}, which leverages ODE-style generative models, and apply it to various editing scenarios including motion prediction, motion in-between prediction, motion interpolation, and upper-body editing. Our code will be released. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.08895v1-abstract-full').style.display = 'none'; document.getElementById('2312.08895v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">WIP</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.08825">arXiv:2312.08825</a> <span> [<a href="https://arxiv.org/pdf/2312.08825">pdf</a>, <a href="https://arxiv.org/format/2312.08825">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Guided Diffusion from Self-Supervised Diffusion Features </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+V+T">Vincent Tao Hu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yunlu Chen</a>, <a href="/search/cs?searchtype=author&query=Caron%2C+M">Mathilde Caron</a>, <a href="/search/cs?searchtype=author&query=Asano%2C+Y+M">Yuki M. Asano</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M.
Snoek</a>, <a href="/search/cs?searchtype=author&query=Ommer%2C+B">Bjorn Ommer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.08825v1-abstract-short" style="display: inline;"> Guidance serves as a key concept in diffusion models, yet its effectiveness is often limited by the need for extra data annotation or classifier pretraining. For this reason, guidance has been harnessed from self-supervised learning backbones such as DINO. However, recent studies have revealed that the feature representation derived from the diffusion model itself is discriminative for numerous downstream tasks a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.08825v1-abstract-full').style.display = 'inline'; document.getElementById('2312.08825v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.08825v1-abstract-full" style="display: none;"> Guidance serves as a key concept in diffusion models, yet its effectiveness is often limited by the need for extra data annotation or classifier pretraining. For this reason, guidance has been harnessed from self-supervised learning backbones such as DINO. However, recent studies have revealed that the feature representation derived from the diffusion model itself is discriminative for numerous downstream tasks as well, which prompts us to propose a framework to extract guidance from, and specifically for, diffusion models. Our research has yielded several significant contributions. Firstly, the guidance signals from diffusion models are on par with those from class-conditioned diffusion models. Secondly, feature regularization, when based on the Sinkhorn-Knopp algorithm, can further enhance feature discriminability in comparison to unconditional diffusion models. Thirdly, we have constructed an online training approach that can concurrently derive guidance from diffusion models for diffusion models. Lastly, we have extended the application of diffusion models along the constant velocity path of ODE to achieve a more favorable balance between sampling steps and fidelity. Our methods perform strongly, outperforming related baselines on large-resolution datasets such as ImageNet256, ImageNet256-100, and LSUN-Churches. Our code will be released. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.08825v1-abstract-full').style.display = 'none'; document.getElementById('2312.08825v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023.
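<p class="is-size-7">The Sinkhorn-Knopp feature regularization mentioned in this abstract can be pictured with the generic balanced-assignment computation used in self-supervised clustering (a SwAV-style sketch under our own assumptions, not the authors' exact formulation):</p> <pre><code>
import numpy as np

def sinkhorn_knopp(scores, n_iters=3, eps=0.05):
    """Turn sample-to-cluster similarity scores into balanced soft assignments.

    scores: (batch, n_clusters) logits. Returns a matrix whose rows are soft
    cluster assignments and whose columns are (approximately) equally used,
    which is the balancing effect that encourages discriminative features.
    """
    q = np.exp(scores / eps)
    q /= q.sum()
    batch, n_clusters = q.shape
    for _ in range(n_iters):
        q /= q.sum(axis=0, keepdims=True)  # normalize over samples ...
        q /= n_clusters                    # ... so every cluster gets equal mass
        q /= q.sum(axis=1, keepdims=True)  # normalize over clusters ...
        q /= batch                         # ... so every sample gets equal mass
    return q * batch                       # rows now sum to one

rng = np.random.default_rng(0)
assignments = sinkhorn_knopp(rng.standard_normal((8, 4)))
print(assignments.sum(axis=1))  # each row is a distribution over clusters
</code></pre>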
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work In Progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.18512">arXiv:2311.18512</a> <span> [<a href="https://arxiv.org/pdf/2311.18512">pdf</a>, <a href="https://arxiv.org/format/2311.18512">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Union-over-Intersections: Object Detection beyond Winner-Takes-All </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bhowmik%2C+A">Aritra Bhowmik</a>, <a href="/search/cs?searchtype=author&query=Mettes%2C+P">Pascal Mettes</a>, <a href="/search/cs?searchtype=author&query=Oswald%2C+M+R">Martin R. Oswald</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M. Snoek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.18512v2-abstract-short" style="display: inline;"> This paper revisits the problem of predicting box locations in object detection architectures. Typically, each box proposal or box query aims to directly maximize the intersection-over-union score with the ground truth, followed by a winner-takes-all non-maximum suppression where only the highest scoring box in each region is retained. We observe that both steps are sub-optimal: the first involves… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.18512v2-abstract-full').style.display = 'inline'; document.getElementById('2311.18512v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.18512v2-abstract-full" style="display: none;"> This paper revisits the problem of predicting box locations in object detection architectures. Typically, each box proposal or box query aims to directly maximize the intersection-over-union score with the ground truth, followed by a winner-takes-all non-maximum suppression where only the highest scoring box in each region is retained. We observe that both steps are sub-optimal: the first involves regressing proposals to the entire ground truth, which is a difficult task even with large receptive fields, and the second neglects valuable information from boxes other than the top candidate. Instead of regressing proposals to the whole ground truth, we propose a simpler approach: regress only to the area of intersection between the proposal and the ground truth. This avoids the need for proposals to extrapolate beyond their visual scope, improving localization accuracy. Rather than adopting a winner-takes-all strategy, we take the union over the regressed intersections of all boxes in a region to generate the final box outputs. Our plug-and-play method integrates seamlessly into proposal-based, grid-based, and query-based detection architectures with minimal modifications, consistently improving object localization and instance segmentation. 
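<p class="is-size-7">A toy numeric sketch of the two ideas just described, regressing each proposal only to its intersection with the ground truth and taking the union of the regressed boxes instead of winner-takes-all (boxes as (x1, y1, x2, y2) tuples; illustrative only, not the authors' code):</p> <pre><code>
def intersection_target(proposal, gt):
    """Regression target: the overlap box between a proposal and the ground
    truth, rather than the full ground-truth box."""
    x1 = max(proposal[0], gt[0]); y1 = max(proposal[1], gt[1])
    x2 = min(proposal[2], gt[2]); y2 = min(proposal[3], gt[3])
    return (x1, y1, x2, y2) if x2 > x1 and y2 > y1 else None

def union_box(boxes):
    """Final output: the union (bounding hull) of all regressed intersection
    boxes in a region, instead of keeping only the top-scoring box."""
    xs1, ys1, xs2, ys2 = zip(*boxes)
    return (min(xs1), min(ys1), max(xs2), max(ys2))

gt = (10, 10, 50, 50)
proposals = [(5, 5, 30, 30), (25, 25, 60, 60)]
targets = [intersection_target(p, gt) for p in proposals]
print(targets)                                    # each covers only its overlap
print(union_box([b for b in targets if b]))       # union recovers (10, 10, 50, 50)
</code></pre>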
We demonstrate its broad applicability and versatility across various detection and segmentation tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.18512v2-abstract-full').style.display = 'none'; document.getElementById('2311.18512v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 6 figures, 12 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.17937">arXiv:2311.17937</a> <span> [<a href="https://arxiv.org/pdf/2311.17937">pdf</a>, <a href="https://arxiv.org/format/2311.17937">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unlocking Spatial Comprehension in Text-to-Image Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Derakhshani%2C+M+M">Mohammad Mahdi Derakhshani</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+M">Menglin Xia</a>, <a href="/search/cs?searchtype=author&query=Behl%2C+H">Harkirat Behl</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M. Snoek</a>, <a href="/search/cs?searchtype=author&query=R%C3%BChle%2C+V">Victor Rühle</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.17937v1-abstract-short" style="display: inline;"> We propose CompFuser, an image generation pipeline that enhances spatial comprehension and attribute assignment in text-to-image generative models. Our pipeline enables the interpretation of instructions defining spatial relationships between objects in a scene, such as `An image of a gray cat on the left of an orange dog', and the generation of corresponding images. This is especially important in order t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.17937v1-abstract-full').style.display = 'inline'; document.getElementById('2311.17937v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.17937v1-abstract-full" style="display: none;"> We propose CompFuser, an image generation pipeline that enhances spatial comprehension and attribute assignment in text-to-image generative models. Our pipeline enables the interpretation of instructions defining spatial relationships between objects in a scene, such as `An image of a gray cat on the left of an orange dog', and the generation of corresponding images. This is especially important in order to provide more control to the user.
CompFuser overcomes the limitation of existing text-to-image diffusion models by decomposing the generation of multiple objects into iterative steps: first generating a single object and then editing the image by placing additional objects in their designated positions. To create training data for spatial comprehension and attribute assignment, we introduce a synthetic data generation process that leverages a frozen large language model and a frozen layout-based diffusion model for object placement. We compare our approach to strong baselines and show that our model outperforms state-of-the-art image generation models in spatial comprehension and attribute assignment, despite being 3x to 5x smaller in parameters. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.17937v1-abstract-full').style.display = 'none'; document.getElementById('2311.17937v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.13895">arXiv:2311.13895</a> <span> [<a href="https://arxiv.org/pdf/2311.13895">pdf</a>, <a href="https://arxiv.org/format/2311.13895">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Query by Activity Video in the Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+T">Tao Hu</a>, <a href="/search/cs?searchtype=author&query=Thong%2C+W">William Thong</a>, <a href="/search/cs?searchtype=author&query=Mettes%2C+P">Pascal Mettes</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M. Snoek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.13895v1-abstract-short" style="display: inline;"> This paper focuses on activity retrieval from a video query in an imbalanced scenario. In current query-by-activity-video literature, a common assumption is that all activities have sufficient labelled examples when learning an embedding. This assumption, however, does not hold in practice, as only a portion of activities have many examples, while other activities are only described by few examples.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.13895v1-abstract-full').style.display = 'inline'; document.getElementById('2311.13895v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.13895v1-abstract-full" style="display: none;"> This paper focuses on activity retrieval from a video query in an imbalanced scenario. In current query-by-activity-video literature, a common assumption is that all activities have sufficient labelled examples when learning an embedding. This assumption, however, does not hold in practice, as only a portion of activities have many examples, while other activities are only described by few examples.
In this paper, we propose a visual-semantic embedding network that explicitly deals with the imbalanced scenario for activity retrieval. Our network contains two novel modules. The visual alignment module performs a global alignment between the input video and fixed-sized visual bank representations for all activities. The semantic module performs an alignment between the input video and fixed-sized semantic activity representations. By matching videos with both visual and semantic activity representations that are of equal size over all activities, we no longer ignore infrequent activities during retrieval. Experiments on a new imbalanced activity retrieval benchmark show the effectiveness of our approach for all types of activities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.13895v1-abstract-full').style.display = 'none'; document.getElementById('2311.13895v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">An extended version of ICIP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.08851">arXiv:2311.08851</a> <span> [<a href="https://arxiv.org/pdf/2311.08851">pdf</a>, <a href="https://arxiv.org/format/2311.08851">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Data Augmentations in Deep Weight Spaces </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shamsian%2C+A">Aviv Shamsian</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D+W">David W. Zhang</a>, <a href="/search/cs?searchtype=author&query=Navon%2C+A">Aviv Navon</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yan Zhang</a>, <a href="/search/cs?searchtype=author&query=Kofinas%2C+M">Miltiadis Kofinas</a>, <a href="/search/cs?searchtype=author&query=Achituve%2C+I">Idan Achituve</a>, <a href="/search/cs?searchtype=author&query=Valperga%2C+R">Riccardo Valperga</a>, <a href="/search/cs?searchtype=author&query=Burghouts%2C+G+J">Gertjan J. Burghouts</a>, <a href="/search/cs?searchtype=author&query=Gavves%2C+E">Efstratios Gavves</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M. 
Snoek</a>, <a href="/search/cs?searchtype=author&query=Fetaya%2C+E">Ethan Fetaya</a>, <a href="/search/cs?searchtype=author&query=Chechik%2C+G">Gal Chechik</a>, <a href="/search/cs?searchtype=author&query=Maron%2C+H">Haggai Maron</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.08851v1-abstract-short" style="display: inline;"> Learning in weight spaces, where neural networks process the weights of other deep neural networks, has emerged as a promising research direction with applications in various fields, from analyzing and editing neural fields and implicit neural representations, to network pruning and quantization. Recent works designed architectures for effective learning in that space, which take into account its… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.08851v1-abstract-full').style.display = 'inline'; document.getElementById('2311.08851v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.08851v1-abstract-full" style="display: none;"> Learning in weight spaces, where neural networks process the weights of other deep neural networks, has emerged as a promising research direction with applications in various fields, from analyzing and editing neural fields and implicit neural representations, to network pruning and quantization. Recent works designed architectures for effective learning in that space, which take into account its unique, permutation-equivariant structure. Unfortunately, so far these architectures suffer from severe overfitting and were shown to benefit from large datasets. This poses a significant challenge because generating data for this learning setup is laborious and time-consuming since each data sample is a full set of network weights that has to be trained. In this paper, we address this difficulty by investigating data augmentations for weight spaces, a set of techniques that enable generating new data examples on the fly without having to train additional input weight space elements. We first review several recently proposed data augmentation schemes and divide them into categories. We then introduce a novel augmentation scheme based on the Mixup method. We evaluate the performance of these techniques on existing benchmarks as well as new benchmarks we generate, which can be valuable for future studies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.08851v1-abstract-full').style.display = 'none'; document.getElementById('2311.08851v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023.
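<p class="is-size-7">One concrete weight-space augmentation follows directly from the permutation symmetry noted above: reordering the hidden units of an MLP changes its weights but not its function, so every permutation yields a free new training example for the weight-space learner. A minimal numpy sketch (illustrative; the paper's Mixup-based scheme is more involved):</p> <pre><code>
import numpy as np

def permute_hidden_units(w1, b1, w2, rng):
    """Permutation augmentation for a 2-layer MLP  y = w2 @ relu(w1 @ x + b1).

    Reordering hidden units leaves the network's function unchanged, so the
    permuted weights are a new, equally valid weight-space data point.
    """
    perm = rng.permutation(w1.shape[0])
    return w1[perm], b1[perm], w2[:, perm]

rng = np.random.default_rng(0)
w1, b1, w2 = rng.standard_normal((16, 8)), rng.standard_normal(16), rng.standard_normal((4, 16))
w1p, b1p, w2p = permute_hidden_units(w1, b1, w2, rng)

x = rng.standard_normal(8)
relu = lambda z: np.maximum(z, 0.0)
print(np.allclose(w2 @ relu(w1 @ x + b1), w2p @ relu(w1p @ x + b1p)))  # True
</code></pre>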
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2023 Workshop on Symmetry and Geometry in Neural Representations</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.19776">arXiv:2310.19776</a> <span> [<a href="https://arxiv.org/pdf/2310.19776">pdf</a>, <a href="https://arxiv.org/format/2310.19776">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Learn to Categorize or Categorize to Learn? Self-Coding for Generalized Category Discovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Rastegar%2C+S">Sarah Rastegar</a>, <a href="/search/cs?searchtype=author&query=Doughty%2C+H">Hazel Doughty</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M. Snoek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.19776v3-abstract-short" style="display: inline;"> In the quest for unveiling novel categories at test time, we confront the inherent limitations of traditional supervised recognition models that are restricted by a predefined category set. While strides have been made in the realms of self-supervised and open-world learning towards test-time category discovery, a crucial yet often overlooked question persists: what exactly delineates a category?… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.19776v3-abstract-full').style.display = 'inline'; document.getElementById('2310.19776v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.19776v3-abstract-full" style="display: none;"> In the quest for unveiling novel categories at test time, we confront the inherent limitations of traditional supervised recognition models that are restricted by a predefined category set. While strides have been made in the realms of self-supervised and open-world learning towards test-time category discovery, a crucial yet often overlooked question persists: what exactly delineates a category? In this paper, we conceptualize a category through the lens of optimization, viewing it as an optimal solution to a well-defined problem. Harnessing this unique conceptualization, we propose a novel, efficient and self-supervised method capable of discovering previously unknown categories at test time. A salient feature of our approach is the assignment of minimum length category codes to individual data instances, which encapsulates the implicit category hierarchy prevalent in real-world datasets. This mechanism affords us enhanced control over category granularity, thereby equipping our model to handle fine-grained categories adeptly. 
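<p class="is-size-7">The minimum-length category codes can be loosely related to classical variable-length coding, where frequent (coarse) categories receive short codes and rare (fine-grained) ones longer codes, so code length itself reflects a hierarchy. A self-contained Huffman-coding analogy (our illustration only; the paper learns its codes rather than deriving them from frequencies):</p> <pre><code>
import heapq

def huffman_codes(freqs):
    """Assign minimum expected-length binary codes given category frequencies.
    Returns {category: bitstring}; rarer categories get longer codes."""
    heap = [(f, i, {c: ""}) for i, (c, f) in enumerate(sorted(freqs.items()))]
    heapq.heapify(heap)
    counter = len(heap)
    while len(heap) > 1:
        f1, _, codes1 = heapq.heappop(heap)
        f2, _, codes2 = heapq.heappop(heap)
        merged = {c: "0" + code for c, code in codes1.items()}
        merged.update({c: "1" + code for c, code in codes2.items()})
        heapq.heappush(heap, (f1 + f2, counter, merged))
        counter += 1
    return heap[0][2]

# Frequent coarse classes get short codes; rare fine-grained ones longer codes.
print(huffman_codes({"dog": 40, "cat": 35, "sphynx cat": 5, "dingo": 3}))
</code></pre>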
Experimental evaluations, bolstered by state-of-the-art benchmark comparisons, testify to the efficacy of our solution in managing unknown categories at test time. Furthermore, we fortify our proposition with a theoretical foundation, providing proof of its optimality. Our code is available at https://github.com/SarahRastegar/InfoSieve. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.19776v3-abstract-full').style.display = 'none'; document.getElementById('2310.19776v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2023</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.1.b; I.2.6.g; I.5.4.b; I.4 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.05920">arXiv:2310.05920</a> <span> [<a href="https://arxiv.org/pdf/2310.05920">pdf</a>, <a href="https://arxiv.org/format/2310.05920">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SimPLR: A Simple and Plain Transformer for Scaling-Efficient Object Detection and Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nguyen%2C+D">Duy-Kien Nguyen</a>, <a href="/search/cs?searchtype=author&query=Oswald%2C+M+R">Martin R. Oswald</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M. Snoek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.05920v3-abstract-short" style="display: inline;"> The ability to detect objects in images at varying scales has played a pivotal role in the design of modern object detectors. Despite considerable progress in removing hand-crafted components and simplifying the architecture with transformers, multi-scale feature maps and/or pyramid design remain a key factor for their empirical success. In this paper, we show that this reliance on either feature… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.05920v3-abstract-full').style.display = 'inline'; document.getElementById('2310.05920v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.05920v3-abstract-full" style="display: none;"> The ability to detect objects in images at varying scales has played a pivotal role in the design of modern object detectors. Despite considerable progress in removing hand-crafted components and simplifying the architecture with transformers, multi-scale feature maps and/or pyramid design remain a key factor for their empirical success. 
In this paper, we show that this reliance on either feature pyramids or a hierarchical backbone is unnecessary: a transformer-based detector with scale-aware attention enables the plain detector `SimPLR', whose backbone and detection head are both non-hierarchical and operate on single-scale features. We find through our experiments that SimPLR with scale-aware attention is plain and simple, yet competitive with multi-scale vision transformer alternatives. Compared to the multi-scale and single-scale state-of-the-art, our model scales much better with bigger-capacity (self-supervised) models and more pre-training data, allowing us to report consistently better accuracy and faster runtimes for object detection, instance segmentation, and panoptic segmentation. Code will be released. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.05920v3-abstract-full').style.display = 'none'; document.getElementById('2310.05920v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.00500">arXiv:2310.00500</a> <span> [<a href="https://arxiv.org/pdf/2310.00500">pdf</a>, <a href="https://arxiv.org/format/2310.00500">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Self-Supervised Open-Ended Classification with Small Visual Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Derakhshani%2C+M+M">Mohammad Mahdi Derakhshani</a>, <a href="/search/cs?searchtype=author&query=Najdenkoska%2C+I">Ivona Najdenkoska</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M. Snoek</a>, <a href="/search/cs?searchtype=author&query=Worring%2C+M">Marcel Worring</a>, <a href="/search/cs?searchtype=author&query=Asano%2C+Y+M">Yuki M. Asano</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.00500v2-abstract-short" style="display: inline;"> We present Self-Context Adaptation (SeCAt), a self-supervised approach that unlocks few-shot abilities for open-ended classification with small visual language models. Our approach imitates image captions in a self-supervised way based on clustering a large pool of images followed by assigning semantically-unrelated names to clusters.
By doing so, we construct a training signal consisting of inter… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.00500v2-abstract-full').style.display = 'inline'; document.getElementById('2310.00500v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.00500v2-abstract-full" style="display: none;"> We present Self-Context Adaptation (SeCAt), a self-supervised approach that unlocks few-shot abilities for open-ended classification with small visual language models. Our approach imitates image captions in a self-supervised way based on clustering a large pool of images followed by assigning semantically-unrelated names to clusters. By doing so, we construct a training signal consisting of interleaved sequences of image and pseudo-caption pairs and a query image, which we denote as the 'self-context' sequence. Based on this signal, the model is trained to produce the right pseudo-caption. We demonstrate the performance and flexibility of SeCAt on several multimodal few-shot datasets, spanning various granularities. By using models with approximately 1B parameters we outperform the few-shot abilities of much larger models, such as Frozen and FROMAGe. SeCAt opens new possibilities for research and applications in open-ended few-shot learning that would otherwise require access to large or proprietary models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.00500v2-abstract-full').style.display = 'none'; document.getElementById('2310.00500v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2308.11796">arXiv:2308.11796</a> <span> [<a href="https://arxiv.org/pdf/2308.11796">pdf</a>, <a href="https://arxiv.org/format/2308.11796">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Time Does Tell: Self-Supervised Time-Tuning of Dense Image Representations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Salehi%2C+M">Mohammadreza Salehi</a>, <a href="/search/cs?searchtype=author&query=Gavves%2C+E">Efstratios Gavves</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M. Snoek</a>, <a href="/search/cs?searchtype=author&query=Asano%2C+Y+M">Yuki M. Asano</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2308.11796v1-abstract-short" style="display: inline;"> Spatially dense self-supervised learning is a rapidly growing problem domain with promising applications for unsupervised segmentation and pretraining for dense downstream tasks. Despite the abundance of temporal data in the form of videos, this information-rich source has been largely overlooked.
Our paper aims to address this gap by proposing a novel approach that incorporates temporal consisten… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.11796v1-abstract-full').style.display = 'inline'; document.getElementById('2308.11796v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2308.11796v1-abstract-full" style="display: none;"> Spatially dense self-supervised learning is a rapidly growing problem domain with promising applications for unsupervised segmentation and pretraining for dense downstream tasks. Despite the abundance of temporal data in the form of videos, this information-rich source has been largely overlooked. Our paper aims to address this gap by proposing a novel approach that incorporates temporal consistency in dense self-supervised learning. While methods designed solely for images face difficulties in achieving even the same performance on videos, our method improves the representation quality not only for videos but also for images. Our approach, which we call time-tuning, starts from image-pretrained models and fine-tunes them with a novel self-supervised temporal-alignment clustering loss on unlabeled videos. This effectively facilitates the transfer of high-level information from videos to image representations. Time-tuning improves the state-of-the-art by 8-10% for unsupervised semantic segmentation on videos and matches it for images. We believe this method paves the way for further self-supervised scaling by leveraging the abundant availability of videos. The implementation can be found here: https://github.com/SMSD75/Timetuning <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2308.11796v1-abstract-full').style.display = 'none'; document.getElementById('2308.11796v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.04033">arXiv:2307.04033</a> <span> [<a href="https://arxiv.org/pdf/2307.04033">pdf</a>, <a href="https://arxiv.org/format/2307.04033">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Probabilistic Test-Time Generalization by Variational Neighbor-Labeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ambekar%2C+S">Sameer Ambekar</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Z">Zehao Xiao</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+J">Jiayi Shen</a>, <a href="/search/cs?searchtype=author&query=Zhen%2C+X">Xiantong Zhen</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M.
Snoek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.04033v3-abstract-short" style="display: inline;"> This paper strives for domain generalization, where models are trained exclusively on source domains before being deployed on unseen target domains. We follow the strict separation of source training and target testing, but exploit the value of the unlabeled target data itself during inference. We make three contributions. First, we propose probabilistic pseudo-labeling of target samples to genera… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.04033v3-abstract-full').style.display = 'inline'; document.getElementById('2307.04033v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.04033v3-abstract-full" style="display: none;"> This paper strives for domain generalization, where models are trained exclusively on source domains before being deployed on unseen target domains. We follow the strict separation of source training and target testing, but exploit the value of the unlabeled target data itself during inference. We make three contributions. First, we propose probabilistic pseudo-labeling of target samples to generalize the source-trained model to the target domain at test time. We formulate the generalization at test time as a variational inference problem, by modeling pseudo labels as distributions, to consider the uncertainty during generalization and alleviate the misleading signal of inaccurate pseudo labels. Second, we learn variational neighbor labels that incorporate the information of neighboring target samples to generate more robust pseudo labels. Third, to learn the ability to incorporate more representative target information and generate more precise and robust variational neighbor labels, we introduce a meta-generalization stage during training to simulate the generalization procedure. Experiments on seven widely-used datasets demonstrate the benefits, abilities, and effectiveness of our proposal. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.04033v3-abstract-full').style.display = 'none'; document.getElementById('2307.04033v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. 
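<p class="is-size-7">A stripped-down illustration of the neighbor-labeling intuition: average the predicted class distributions of a target sample's nearest feature-space neighbors to obtain a more robust pseudo label (numpy sketch with hypothetical shapes; the paper's variational treatment of pseudo labels as distributions is omitted here):</p> <pre><code>
import numpy as np

def neighbor_pseudo_labels(features, probs, k=5):
    """For each target sample, average the softmax predictions of its k
    nearest neighbors (cosine similarity) to form a more robust pseudo label.

    features: (n, d) target features; probs: (n, n_classes) model softmaxes.
    """
    f = features / np.linalg.norm(features, axis=1, keepdims=True)
    sim = f @ f.T
    np.fill_diagonal(sim, -np.inf)          # exclude the sample itself
    idx = np.argsort(-sim, axis=1)[:, :k]   # indices of k nearest neighbors
    return probs[idx].mean(axis=1)          # averaged neighbor distributions

rng = np.random.default_rng(0)
feats = rng.standard_normal((32, 16))
logits = rng.standard_normal((32, 10))
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
print(neighbor_pseudo_labels(feats, probs).shape)  # (32, 10)
</code></pre>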
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CoLLAs 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.12795">arXiv:2306.12795</a> <span> [<a href="https://arxiv.org/pdf/2306.12795">pdf</a>, <a href="https://arxiv.org/format/2306.12795">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Learning Unseen Modality Interaction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yunhua Zhang</a>, <a href="/search/cs?searchtype=author&query=Doughty%2C+H">Hazel Doughty</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M. Snoek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.12795v3-abstract-short" style="display: inline;"> Multimodal learning assumes all modality combinations of interest are available during training to learn cross-modal correspondences. In this paper, we challenge this modality-complete assumption for multimodal learning and instead strive for generalization to unseen modality combinations during inference. We pose the problem of unseen modality interaction and introduce a first solution. It exploi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.12795v3-abstract-full').style.display = 'inline'; document.getElementById('2306.12795v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.12795v3-abstract-full" style="display: none;"> Multimodal learning assumes all modality combinations of interest are available during training to learn cross-modal correspondences. In this paper, we challenge this modality-complete assumption for multimodal learning and instead strive for generalization to unseen modality combinations during inference. We pose the problem of unseen modality interaction and introduce a first solution. It exploits a module that projects the multidimensional features of different modalities into a common space with rich information preserved. This allows the information to be accumulated with a simple summation operation across available modalities. To reduce overfitting to less discriminative modality combinations during training, we further improve the model learning with pseudo-supervision indicating the reliability of a modality's prediction. We demonstrate that our approach is effective for diverse tasks and modalities by evaluating it for multimodal video classification, robot state regression, and multimedia retrieval. Project website: https://xiaobai1217.github.io/Unseen-Modality-Interaction/. 
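<p class="is-size-7">The core mechanism described above, projecting every modality into one common space so that any subset of modalities can be fused by simple summation, can be sketched as follows (random matrices stand in for the learned projectors; dimensions are hypothetical):</p> <pre><code>
import numpy as np

rng = np.random.default_rng(0)
common_dim = 64
# One learned projection per modality maps its native feature size into a
# shared space; here random matrices stand in for the trained projectors.
projectors = {
    "video": rng.standard_normal((512, common_dim)) * 0.05,
    "audio": rng.standard_normal((128, common_dim)) * 0.05,
    "depth": rng.standard_normal((256, common_dim)) * 0.05,
}

def fuse(available):
    """Sum projected features over whichever modalities are present, so an
    unseen combination needs no dedicated fusion module."""
    return sum(feat @ projectors[name] for name, feat in available.items())

# Even if training only ever saw video+audio, a video+depth pair fuses
# in exactly the same way at inference time.
sample = {"video": rng.standard_normal(512), "depth": rng.standard_normal(256)}
print(fuse(sample).shape)  # (64,)
</code></pre>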
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.12795v3-abstract-full').style.display = 'none'; document.getElementById('2306.12795v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published at NeurIPS 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.10122">arXiv:2306.10122</a> <span> [<a href="https://arxiv.org/pdf/2306.10122">pdf</a>, <a href="https://arxiv.org/format/2306.10122">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3591106.3592267">10.1145/3591106.3592267 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Multi-Label Meta Weighting for Long-Tailed Dynamic Scene Graph Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shuo Chen</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Y">Yingjun Du</a>, <a href="/search/cs?searchtype=author&query=Mettes%2C+P">Pascal Mettes</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M. Snoek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.10122v1-abstract-short" style="display: inline;"> This paper investigates the problem of scene graph generation in videos with the aim of capturing semantic relations between subjects and objects in the form of $\langle$subject, predicate, object$\rangle$ triplets. Recognizing the predicate between subject and object pairs is imbalanced and multi-label in nature, ranging from ubiquitous interactions such as spatial relationships (\eg \emph{in fro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.10122v1-abstract-full').style.display = 'inline'; document.getElementById('2306.10122v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.10122v1-abstract-full" style="display: none;"> This paper investigates the problem of scene graph generation in videos with the aim of capturing semantic relations between subjects and objects in the form of $\langle$subject, predicate, object$\rangle$ triplets. Recognizing the predicate between subject and object pairs is imbalanced and multi-label in nature, ranging from ubiquitous interactions such as spatial relationships (\eg \emph{in front of}) to rare interactions such as \emph{twisting}. 
In widely-used benchmarks such as Action Genome and VidOR, the imbalance ratio between the most and least frequent predicates reaches 3,218 and 3,408, respectively, surpassing even benchmarks specifically designed for long-tailed recognition. Due to the long-tailed distributions and label co-occurrences, recent state-of-the-art methods predominantly focus on the most frequently occurring predicate classes, ignoring those in the long tail. In this paper, we analyze the limitations of current approaches for scene graph generation in videos and identify a one-to-one correspondence between predicate frequency and recall performance. To take a step towards unbiased scene graph generation in videos, we introduce a multi-label meta-learning framework to deal with the biased predicate distribution. Our meta-learning framework learns a meta-weight network for each training sample over all possible label losses. We evaluate our approach on the Action Genome and VidOR benchmarks by building upon two current state-of-the-art methods for each benchmark. The experiments demonstrate that the multi-label meta-weight network improves the performance for predicates in the long tail without compromising performance for head classes, resulting in better overall performance and favorable generalizability. Code: \url{https://github.com/shanshuo/ML-MWN}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.10122v1-abstract-full').style.display = 'none'; document.getElementById('2306.10122v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICMR 2023</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.10 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.05411">arXiv:2306.05411</a> <span> [<a href="https://arxiv.org/pdf/2306.05411">pdf</a>, <a href="https://arxiv.org/format/2306.05411">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> R-MAE: Regions Meet Masked Autoencoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nguyen%2C+D">Duy-Kien Nguyen</a>, <a href="/search/cs?searchtype=author&query=Aggarwal%2C+V">Vaibhav Aggarwal</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yanghao Li</a>, <a href="/search/cs?searchtype=author&query=Oswald%2C+M+R">Martin R. Oswald</a>, <a href="/search/cs?searchtype=author&query=Kirillov%2C+A">Alexander Kirillov</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M.
Snoek</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinlei Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.05411v2-abstract-short" style="display: inline;"> In this work, we explore regions as a potential visual analogue of words for self-supervised image representation learning. Inspired by Masked Autoencoding (MAE), a generative pre-training baseline, we propose masked region autoencoding to learn from groups of pixels or regions. Specifically, we design an architecture which efficiently addresses the one-to-many mapping between images and regions,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.05411v2-abstract-full').style.display = 'inline'; document.getElementById('2306.05411v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.05411v2-abstract-full" style="display: none;"> In this work, we explore regions as a potential visual analogue of words for self-supervised image representation learning. Inspired by Masked Autoencoding (MAE), a generative pre-training baseline, we propose masked region autoencoding to learn from groups of pixels or regions. Specifically, we design an architecture which efficiently addresses the one-to-many mapping between images and regions, while being highly effective especially with high-quality regions. When integrated with MAE, our approach (R-MAE) demonstrates consistent improvements across various pre-training datasets and downstream detection and segmentation benchmarks, with negligible computational overheads. Beyond the quantitative evaluation, our analysis indicates the models pre-trained with masked region autoencoding unlock the potential for interactive segmentation. The code is provided at https://github.com/facebookresearch/r-mae. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.05411v2-abstract-full').style.display = 'none'; document.getElementById('2306.05411v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.05189">arXiv:2306.05189</a> <span> [<a href="https://arxiv.org/pdf/2306.05189">pdf</a>, <a href="https://arxiv.org/format/2306.05189">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> EMO: Episodic Memory Optimization for Few-Shot Meta-Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Du%2C+Y">Yingjun Du</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+J">Jiayi Shen</a>, <a href="/search/cs?searchtype=author&query=Zhen%2C+X">Xiantong Zhen</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M. 
Snoek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.05189v3-abstract-short" style="display: inline;"> Few-shot meta-learning presents a challenge for gradient descent optimization due to the limited number of training samples per task. To address this issue, we propose an episodic memory optimization for meta-learning, which we call EMO, inspired by the human ability to recall past learning experiences from the brain's memory. EMO retains the gradient history of past experienced tasks in extern… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.05189v3-abstract-full').style.display = 'inline'; document.getElementById('2306.05189v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.05189v3-abstract-full" style="display: none;"> Few-shot meta-learning presents a challenge for gradient descent optimization due to the limited number of training samples per task. To address this issue, we propose an episodic memory optimization for meta-learning, which we call EMO, inspired by the human ability to recall past learning experiences from the brain's memory. EMO retains the gradient history of past experienced tasks in external memory, enabling few-shot learning in a memory-augmented way. By learning to retain and recall the learning process of past training tasks, EMO nudges parameter updates in the right direction, even when the gradients provided by a limited number of examples are uninformative. We prove theoretically that our algorithm converges for smooth, strongly convex objectives. EMO is generic, flexible, and model-agnostic, making it a simple plug-and-play optimizer that can be seamlessly embedded into existing optimization-based few-shot meta-learning approaches. Empirical results show that EMO performs well on most few-shot classification benchmarks and improves the performance of optimization-based meta-learning methods, resulting in accelerated convergence. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.05189v3-abstract-full').style.display = 'none'; document.getElementById('2306.05189v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023.
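<p class="is-size-7">In spirit, an episodic gradient memory blends the current, possibly uninformative few-shot gradient with gradients recalled from past tasks. The following is our simplified sketch of such a memory-augmented update rule, not the published EMO algorithm:</p> <pre><code>
import numpy as np

class GradientMemoryOptimizer:
    """Toy optimizer that blends the current (possibly uninformative) few-shot
    gradient with the average gradient recalled from past tasks."""

    def __init__(self, lr=0.1, recall=0.5, capacity=100):
        self.lr, self.recall, self.capacity = lr, recall, capacity
        self.memory = []  # external episodic memory of past task gradients

    def step(self, params, grad):
        if self.memory:
            recalled = np.mean(self.memory, axis=0)
            grad = (1 - self.recall) * grad + self.recall * recalled
        self.memory.append(grad)
        self.memory = self.memory[-self.capacity:]  # bounded memory
        return params - self.lr * grad

opt = GradientMemoryOptimizer()
params = np.zeros(4)
for g in np.random.default_rng(0).standard_normal((10, 4)):
    params = opt.step(params, g)
print(params)
</code></pre>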
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CoLLAs 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.05129">arXiv:2306.05129</a> <span> [<a href="https://arxiv.org/pdf/2306.05129">pdf</a>, <a href="https://arxiv.org/format/2306.05129">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Focus for Free in Density-Based Counting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zenglin Shi</a>, <a href="/search/cs?searchtype=author&query=Mettes%2C+P">Pascal Mettes</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M. Snoek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.05129v1-abstract-short" style="display: inline;"> This work considers supervised learning to count from images and their corresponding point annotations. Where density-based counting methods typically use the point annotations only to create Gaussian-density maps, which act as the supervision signal, the starting point of this work is that point annotations have counting potential beyond density map generation. We introduce two methods that repur… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.05129v1-abstract-full').style.display = 'inline'; document.getElementById('2306.05129v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.05129v1-abstract-full" style="display: none;"> This work considers supervised learning to count from images and their corresponding point annotations. Where density-based counting methods typically use the point annotations only to create Gaussian-density maps, which act as the supervision signal, the starting point of this work is that point annotations have counting potential beyond density map generation. We introduce two methods that repurpose the available point annotations to enhance counting performance. The first is a counting-specific augmentation that leverages point annotations to simulate occluded objects in both input and density images to enhance the network's robustness to occlusions. The second method, foreground distillation, generates foreground masks from the point annotations, from which we train an auxiliary network on images with blacked-out backgrounds. By doing so, it learns to extract foreground counting knowledge without interference from the background. These methods can be seamlessly integrated with existing counting advances and are adaptable to different loss functions. We demonstrate complementary effects of the approaches, allowing us to achieve robust counting results even in challenging scenarios such as background clutter, occlusion, and varying crowd densities. Our proposed approach achieves strong counting results on multiple datasets, including ShanghaiTech Part\_A and Part\_B, UCF\_QNRF, JHU-Crowd++, and NWPU-Crowd. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.05129v1-abstract-full').style.display = 'none'; document.getElementById('2306.05129v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2305.10309">arXiv:2305.10309</a> <span> [<a href="https://arxiv.org/pdf/2305.10309">pdf</a>, <a href="https://arxiv.org/format/2305.10309">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MetaModulation: Learning Variational Feature Hierarchies for Few-Shot Learning with Fewer Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+W">Wenfang Sun</a>, <a href="/search/cs?searchtype=author&query=Du%2C+Y">Yingjun Du</a>, <a href="/search/cs?searchtype=author&query=Zhen%2C+X">Xiantong Zhen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+F">Fan Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Ling Wang</a>, <a href="/search/cs?searchtype=author&query=Snoek%2C+C+G+M">Cees G. M. Snoek</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2305.10309v1-abstract-short" style="display: inline;"> Meta-learning algorithms are able to learn a new task using previously learned knowledge, but they often require a large number of meta-training tasks which may not be readily available. To address this issue, we propose a method for few-shot learning with fewer tasks, which we call MetaModulation. The key idea is to use a neural network to increase the density of the meta-training tasks by modula… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2305.10309v1-abstract-full').style.display = 'inline'; document.getElementById('2305.10309v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2305.10309v1-abstract-full" style="display: none;"> Meta-learning algorithms are able to learn a new task using previously learned knowledge, but they often require a large number of meta-training tasks which may not be readily available. To address this issue, we propose a method for few-shot learning with fewer tasks, which we call MetaModulation. The key idea is to use a neural network to increase the density of the meta-training tasks by modulating batch normalization parameters during meta-training. Additionally, we modify parameters at various network levels, rather than just a single layer, to increase task diversity. To account for the uncertainty caused by the limited training tasks, we propose a variational MetaModulation where the modulation parameters are treated as latent variables. 

arXiv:2304.00961 (https://arxiv.org/abs/2304.00961) [pdf, other] cs.CV
Self-Ordering Point Clouds
Authors: Pengwan Yang, Cees G. M. Snoek, Yuki M. Asano
Abstract: In this paper we address the task of finding representative subsets of points in a 3D point cloud by means of a point-wise ordering. Only a few works have tried to address this challenging vision problem, all relying on hard-to-obtain point and cloud labels. Different from these works, we introduce the task of point-wise ordering in 3D point clouds through self-supervision, which we call self-ordering. We further contribute the first end-to-end trainable network that learns a point-wise ordering in a self-supervised fashion. It utilizes a novel differentiable point scoring-sorting strategy and constructs a hierarchical contrastive scheme to obtain self-supervision signals. We extensively ablate the method and show its scalability and superior performance even compared to supervised ordering methods on multiple datasets and tasks, including zero-shot ordering of point clouds from unseen categories.
Submitted 10 April, 2023; v1 submitted 3 April, 2023; originally announced April 2023.
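
One way to make point selection differentiable is a softmax-based soft top-k; the paper's scoring-sorting operator differs, so treat this only as a sketch of the idea, with the scoring network replaced by a toy function:

```python
# Soft top-k point selection: each output slot is a softmax-weighted average.
import torch

def soft_select(points, scores, k, tau=0.1):
    # points: (N, 3), scores: (N,); tau is an assumed temperature
    topk = torch.topk(scores, k).values               # anchor scores per slot
    weights = torch.softmax((scores[None, :] - topk[:, None]) / tau, dim=1)
    return weights @ points                           # (k, 3) soft subset

pts = torch.randn(1024, 3)
scores = pts.norm(dim=1)        # toy stand-in for a learned scoring network
subset = soft_select(pts, scores, k=64)
print(subset.shape)             # torch.Size([64, 3])
```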

arXiv:2304.00101 (https://arxiv.org/abs/2304.00101) [pdf, other] cs.CV
SuperDisco: Super-Class Discovery Improves Visual Recognition for the Long-Tail
Authors: Yingjun Du, Jiayi Shen, Xiantong Zhen, Cees G. M. Snoek
Abstract: Modern image classifiers perform well on populated classes while degrading considerably on tail classes with only a few instances. Humans, by contrast, effortlessly handle the long-tailed recognition challenge, since they can learn the tail representation based on different levels of semantic abstraction, making the learned tail features more discriminative. This phenomenon motivated us to propose SuperDisco, an algorithm that discovers super-class representations for long-tailed recognition using a graph model. We learn to construct the super-class graph to guide the representation learning to deal with long-tailed distributions. Through message passing on the super-class graph, image representations are rectified and refined by attending to the most relevant entities based on the semantic similarity among their super-classes. Moreover, we propose to meta-learn the super-class graph under the supervision of a prototype graph constructed from a small amount of imbalanced data. By doing so, we obtain a more robust super-class graph that further improves long-tailed recognition performance. Experiments on the long-tailed CIFAR-100, ImageNet, Places, and iNaturalist datasets demonstrate consistent state-of-the-art results and the benefit of the discovered super-class graph for dealing with long-tailed distributions.
Submitted 31 March, 2023; originally announced April 2023.
Comments: Accepted by CVPR 2023
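
A single message-passing step of this kind can be written as attention from class features to super-class nodes; the graph below is random, whereas SuperDisco learns it:

```python
# One attention-style message-passing step over a (random) super-class graph.
import torch

num_classes, num_super, dim = 10, 3, 16
feats = torch.randn(num_classes, dim)        # per-class representations
super_nodes = torch.randn(num_super, dim)    # assumed super-class embeddings

sim = torch.softmax(feats @ super_nodes.T / dim ** 0.5, dim=1)  # class-to-super affinity
rectified = feats + sim @ super_nodes        # refine features with super-class messages
print(rectified.shape)                       # torch.Size([10, 16])
```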

arXiv:2303.05977 (https://arxiv.org/abs/2303.05977) [pdf, other] cs.CV
Open-Ended Medical Visual Question Answering Through Prefix Tuning of Language Models
Authors: Tom van Sonsbeek, Mohammad Mahdi Derakhshani, Ivona Najdenkoska, Cees G. M. Snoek, Marcel Worring
Abstract: Medical Visual Question Answering (VQA) is an important challenge, as it would lead to faster and more accurate diagnoses and treatment decisions. Most existing methods approach it as a multi-class classification problem, which restricts the outcome to a predefined closed set of curated answers. We focus on open-ended VQA and, motivated by recent advances in language models, consider it a generative task. Leveraging pre-trained language models, we introduce a novel method particularly suited for small, domain-specific, medical datasets. To properly communicate the medical images to the language model, we develop a network that maps the extracted visual features to a set of learnable tokens. Then, alongside the question, these learnable tokens directly prompt the language model. We explore recent parameter-efficient fine-tuning strategies for language models, which allow for resource- and data-efficient fine-tuning. We evaluate our approach on the prime medical VQA benchmarks, namely Slake, OVQA and PathVQA. The results demonstrate that our approach outperforms existing methods across various training settings while also being computationally efficient.
Submitted 21 July, 2023; v1 submitted 10 March, 2023; originally announced March 2023.
MSC Class: 68T07
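
The image-to-prompt interface amounts to a small mapper that turns visual features into a handful of token embeddings prepended to the question. A sketch with assumed dimensions and token count:

```python
# Map visual features to a few prompt tokens for a language model (toy sizes).
import torch
import torch.nn as nn

class VisualPrefix(nn.Module):
    def __init__(self, vis_dim=512, lm_dim=768, n_tokens=8):
        super().__init__()
        self.map = nn.Linear(vis_dim, lm_dim * n_tokens)
        self.n_tokens, self.lm_dim = n_tokens, lm_dim

    def forward(self, vis_feat):                     # (B, vis_dim)
        prefix = self.map(vis_feat)                  # (B, n_tokens * lm_dim)
        return prefix.view(-1, self.n_tokens, self.lm_dim)

prefix = VisualPrefix()(torch.randn(2, 512))         # visual prompt tokens
question_emb = torch.randn(2, 20, 768)               # stand-in question embeddings
lm_input = torch.cat([prefix, question_emb], dim=1)  # what the LM would consume
print(lm_input.shape)                                # torch.Size([2, 28, 768])
```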

arXiv:2302.11215 (https://arxiv.org/abs/2302.11215) [pdf, other] cs.LG
Energy-Based Test Sample Adaptation for Domain Generalization
Authors: Zehao Xiao, Xiantong Zhen, Shengcai Liao, Cees G. M. Snoek
Abstract: In this paper, we propose energy-based sample adaptation at test time for domain generalization. Where previous works adapt their models to target domains, we adapt the unseen target samples to source-trained models. To this end, we design a discriminative energy-based model, which is trained on source domains to jointly model the conditional distribution for classification and the data distribution for sample adaptation. The model is optimized to simultaneously learn a classifier and an energy function. To adapt target samples to source distributions, we iteratively update the samples by energy minimization with stochastic gradient Langevin dynamics. Moreover, to preserve the categorical information in the sample during adaptation, we introduce a categorical latent variable into the energy-based model. The latent variable is learned from the original sample before adaptation by variational inference and fixed as a condition to guide the sample update. Experiments on six benchmarks for classification of images and microblog threads demonstrate the effectiveness of our proposal.
Submitted 22 February, 2023; originally announced February 2023.
Comments: Accepted by ICLR 2023
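
The sample-adaptation loop is stochastic gradient Langevin dynamics on an energy function; with a toy quadratic energy standing in for the source-trained model, the SGLD iteration looks like this:

```python
# SGLD on an energy E(x): gradient descent plus Gaussian noise.
import torch

def energy(x):
    return (x ** 2).sum()   # toy energy, low near the "source" distribution

x = torch.randn(8, requires_grad=True)  # the test sample being adapted
step, n_iters = 0.01, 20
for _ in range(n_iters):
    e = energy(x)
    grad, = torch.autograd.grad(e, x)
    with torch.no_grad():
        # Langevin update: drift toward low energy, plus injected noise.
        x += -0.5 * step * grad + (step ** 0.5) * torch.randn_like(x)
print(energy(x).item())     # energy decreases as x moves toward the source
```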

arXiv:2301.13197 (https://arxiv.org/abs/2301.13197) [pdf, other] cs.LG, cs.CV
Unlocking Slot Attention by Changing Optimal Transport Costs
Authors: Yan Zhang, David W. Zhang, Simon Lacoste-Julien, Gertjan J. Burghouts, Cees G. M. Snoek
Abstract: Slot attention is a powerful method for object-centric modeling in images and videos. However, its set-equivariance limits its ability to handle videos with a dynamic number of objects because it cannot break ties. To overcome this limitation, we first establish a connection between slot attention and optimal transport. Based on this new perspective, we propose MESH (Minimize Entropy of Sinkhorn): a cross-attention module that combines the tiebreaking properties of unregularized optimal transport with the speed of regularized optimal transport. We evaluate slot attention using MESH on multiple object-centric learning benchmarks and find significant improvements over slot attention in every setting.
Submitted 31 May, 2023; v1 submitted 30 January, 2023; originally announced January 2023.
Comments: Published at International Conference on Machine Learning (ICML) 2023
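
The regularized-transport ingredient here is the Sinkhorn iteration, which renormalizes a slot-to-token similarity matrix into a doubly-normalized transport plan; epsilon and the iteration count below are assumptions, not the paper's values:

```python
# Log-domain Sinkhorn normalization of a slot-to-token similarity matrix.
import torch

def sinkhorn(sim, eps=0.05, n_iters=10):
    log_p = sim / eps
    for _ in range(n_iters):
        log_p = log_p - torch.logsumexp(log_p, dim=1, keepdim=True)  # row pass
        log_p = log_p - torch.logsumexp(log_p, dim=0, keepdim=True)  # column pass
    return log_p.exp()

plan = sinkhorn(torch.randn(4, 16))  # 4 slots, 16 input tokens
print(plan.sum(dim=0))               # each column sums to 1 after the final pass
```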

arXiv:2301.02074 (https://arxiv.org/abs/2301.02074) [pdf, other] cs.CV, cs.AI
Test of Time: Instilling Video-Language Models with a Sense of Time
Authors: Piyush Bagad, Makarand Tapaswi, Cees G. M. Snoek
Abstract: Modelling and understanding time remains a challenge in contemporary video understanding models. With language emerging as a key driver towards powerful generalization, it is imperative for foundational video-language models to have a sense of time. In this paper, we consider a specific aspect of temporal understanding: consistency of time order as elicited by before/after relations. We establish that seven existing video-language models struggle to understand even such simple temporal relations. We then question whether it is feasible to equip these foundational models with temporal awareness without re-training them from scratch. Towards this, we propose a temporal adaptation recipe on top of one such model, VideoCLIP, based on post-pretraining on a small amount of video-text data. We conduct a zero-shot evaluation of the adapted models on six datasets for three downstream tasks which require varying degrees of time awareness. We observe encouraging performance gains, especially when the task needs higher time awareness. Our work serves as a first step towards probing and instilling a sense of time in existing video-language models without the need for data- and compute-intensive training from scratch.
Submitted 25 March, 2023; v1 submitted 5 January, 2023; originally announced January 2023.
Comments: Accepted for publication at CVPR 2023. Project page: https://bpiyush.github.io/testoftime-website/index.html
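
A time-order probe of this kind can be posed as a two-way classification between a caption and its temporally reversed version; the random embeddings below are placeholders for real VideoCLIP features, and the loss is a generic contrastive stand-in:

```python
# Toy before/after probe: prefer the correctly ordered caption embedding.
import torch
import torch.nn.functional as F

video = torch.randn(8, 256)       # placeholder video embeddings
correct = torch.randn(8, 256)     # "A before B" caption embeddings
reversed_ = torch.randn(8, 256)   # "B before A" caption embeddings

logits = torch.stack([
    F.cosine_similarity(video, correct),
    F.cosine_similarity(video, reversed_),
], dim=1)                          # (8, 2): score each ordering
loss = F.cross_entropy(logits, torch.zeros(8, dtype=torch.long))
print(loss.item())                 # training would push index 0 to win
```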

arXiv:2212.12395 (https://arxiv.org/abs/2212.12395) [pdf, other] cs.CV
Detecting Objects with Context-Likelihood Graphs and Graph Refinement
Authors: Aritra Bhowmik, Yu Wang, Nora Baka, Martin R. Oswald, Cees G. M. Snoek
Abstract: The goal of this paper is to detect objects by exploiting their interrelationships. Contrary to existing methods, which learn objects and relations separately, our key idea is to learn the object-relation distribution jointly. We first propose a novel way of creating a graphical representation of an image from inter-object relation priors and initial class predictions, which we call a context-likelihood graph. We then learn the joint distribution with an energy-based modeling technique, which allows us to sample and refine the context-likelihood graph iteratively for a given image. Our formulation of jointly learning the distribution enables us to generate a more accurate graph representation of an image, which leads to better object detection performance. We demonstrate the benefits of our context-likelihood graph formulation and the energy-based graph refinement via experiments on the Visual Genome and MS-COCO datasets, where we achieve a consistent improvement over object detectors like DETR and Faster-RCNN, as well as alternative methods modeling object interrelationships separately. Our method is detector agnostic, end-to-end trainable, and especially beneficial for rare object classes.
Submitted 27 September, 2023; v1 submitted 23 December, 2022; originally announced December 2022.
Comments: 13 pages, 8 figures. In Proceedings of International Conference on Computer Vision (ICCV) 2023
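
One way to picture the refinement is gradient descent on an energy that combines per-box class beliefs with a pairwise co-occurrence prior; the prior matrix here is random and the energy is a simplification, whereas the paper learns relation priors and the joint distribution from data:

```python
# Toy energy-based refinement of per-box class beliefs with a pairwise prior.
import torch

n_boxes, n_classes = 5, 10
logits = torch.randn(n_boxes, n_classes, requires_grad=True)  # detector output
prior = torch.randn(n_classes, n_classes)  # assumed relation/co-occurrence prior

for _ in range(10):
    p = logits.softmax(dim=1)
    unary = (p * logits.detach()).sum()     # agreement with initial predictions
    pairwise = (p @ prior @ p.T).sum()      # context compatibility between boxes
    energy = -(unary + pairwise)            # low energy = likely configuration
    grad, = torch.autograd.grad(energy, logits)
    with torch.no_grad():
        logits -= 0.1 * grad                # one refinement step
```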
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2212.12395v3-abstract-full').style.display = 'none'; document.getElementById('2212.12395v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 December, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 8 figures. In Proceedings of International Conference on Computer Vision (ICCV) 2023</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Snoek%2C+C+G+M&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Snoek%2C+C+G+M&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Snoek%2C+C+G+M&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Snoek%2C+C+G+M&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div 
class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>