Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 69 results for author: <span class="mathjax">Patras, I</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Patras%2C+I">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Patras, I"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Patras%2C+I&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Patras, I"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Patras%2C+I&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Patras%2C+I&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Patras%2C+I&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.17813">arXiv:2501.17813</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.17813">pdf</a>, <a href="https://arxiv.org/format/2501.17813">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> P-TAME: Explain Any Image Classifier with Trained Perturbations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ntrougkas%2C+M+V">Mariano V. Ntrougkas</a>, <a href="/search/cs?searchtype=author&amp;query=Mezaris%2C+V">Vasileios Mezaris</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.17813v1-abstract-short" style="display: inline;"> The adoption of Deep Neural Networks (DNNs) in critical fields where predictions need to be accompanied by justifications is hindered by their inherent black-box nature. In this paper, we introduce P-TAME (Perturbation-based Trainable Attention Mechanism for Explanations), a model-agnostic method for explaining DNN-based image classifiers. P-TAME employs an auxiliary image classifier to extract fe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17813v1-abstract-full').style.display = 'inline'; document.getElementById('2501.17813v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.17813v1-abstract-full" style="display: none;"> The adoption of Deep Neural Networks (DNNs) in critical fields where predictions need to be accompanied by justifications is hindered by their inherent black-box nature. 
In this paper, we introduce P-TAME (Perturbation-based Trainable Attention Mechanism for Explanations), a model-agnostic method for explaining DNN-based image classifiers. P-TAME employs an auxiliary image classifier to extract features from the input image, bypassing the need to tailor the explanation method to the internal architecture of the backbone classifier being explained. Unlike traditional perturbation-based methods, which have high computational requirements, P-TAME offers an efficient alternative by generating high-resolution explanations in a single forward pass during inference. We apply P-TAME to explain the decisions of VGG-16, ResNet-50, and ViT-B-16, three distinct and widely used image classifiers. Quantitative and qualitative results show that our method matches or outperforms previous explainability methods, including model-specific approaches. Code and trained models will be released upon acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17813v1-abstract-full').style.display = 'none'; document.getElementById('2501.17813v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted for publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13223">arXiv:2501.13223</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.13223">pdf</a>, <a href="https://arxiv.org/format/2501.13223">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Scaling for Fairness? Analyzing Model Size, Data Composition, and Multilinguality in Vision-Language Bias </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sahili%2C+Z+A">Zahraa Al Sahili</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a>, <a href="/search/cs?searchtype=author&amp;query=Purver%2C+M">Matthew Purver</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13223v2-abstract-short" style="display: inline;"> As large scale vision language models become increasingly central to modern AI applications, understanding and mitigating social biases in these systems has never been more critical. We investigate how dataset composition, model size, and multilingual training affect gender and racial bias in a popular VLM, CLIP, and its open source variants. 
In particular, we systematically evaluate models traine&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13223v2-abstract-full').style.display = 'inline'; document.getElementById('2501.13223v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13223v2-abstract-full" style="display: none;"> As large scale vision language models become increasingly central to modern AI applications, understanding and mitigating social biases in these systems has never been more critical. We investigate how dataset composition, model size, and multilingual training affect gender and racial bias in a popular VLM, CLIP, and its open source variants. In particular, we systematically evaluate models trained on varying dataset scales and architectures, as well as multilingual versions encompassing English along with Persian, Turkish, and Finnish,languages with minimal gender marking. To assess social perception bias, we measure the zero-shot performance on face images featuring socially charged terms rooted in the psychological constructs of communion and agency, and demographic labeling bias using both the FairFace and PATA datasets. Our findings reveal three key insights. First, while larger training datasets can mitigate some biases, they may also introduce or amplify others when the data composition is imbalanced. Second, although increasing model size generally improves performance, it does not consistently reduce bias and can, in certain cases, exacerbate it. Finally, while multilingual training broadens linguistic coverage, it does not inherently neutralize bias and can transfer or intensify inequities across languages. Taken together, these results highlight the necessity of inclusive, carefully curated training data to foster fairness rather than relying solely on model scaling or language expansion. We provide a systematic evaluation for vision language bias across diverse demographics, underscoring the urgent need for intentional bias mitigation strategies in next-generation AI systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13223v2-abstract-full').style.display = 'none'; document.getElementById('2501.13223v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17415">arXiv:2412.17415</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.17415">pdf</a>, <a href="https://arxiv.org/format/2412.17415">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> VidCtx: Context-aware Video Question Answering with Image Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Goulas%2C+A">Andreas Goulas</a>, <a href="/search/cs?searchtype=author&amp;query=Mezaris%2C+V">Vasileios Mezaris</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.17415v1-abstract-short" style="display: inline;"> To address computational and memory limitations of Large Multimodal Models in the Video Question-Answering task, several recent methods extract textual representations per frame (e.g., by captioning) and feed them to a Large Language Model (LLM) that processes them to produce the final response. However, in this way, the LLM does not have access to visual information and often has to process repet&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17415v1-abstract-full').style.display = 'inline'; document.getElementById('2412.17415v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.17415v1-abstract-full" style="display: none;"> To address computational and memory limitations of Large Multimodal Models in the Video Question-Answering task, several recent methods extract textual representations per frame (e.g., by captioning) and feed them to a Large Language Model (LLM) that processes them to produce the final response. However, in this way, the LLM does not have access to visual information and often has to process repetitive textual descriptions of nearby frames. To address those shortcomings, in this paper, we introduce VidCtx, a novel training-free VideoQA framework which integrates both modalities, i.e. both visual information from input frames and textual descriptions of others frames that give the appropriate context. More specifically, in the proposed framework a pre-trained Large Multimodal Model (LMM) is prompted to extract at regular intervals, question-aware textual descriptions (captions) of video frames. Those will be used as context when the same LMM will be prompted to answer the question at hand given as input a) a certain frame, b) the question and c) the context/caption of an appropriate frame. To avoid redundant information, we chose as context the descriptions of distant frames. Finally, a simple yet effective max pooling mechanism is used to aggregate the frame-level decisions. This methodology enables the model to focus on the relevant segments of the video and scale to a high number of frames. 
Experiments show that VidCtx achieves competitive performance among approaches that rely on open models on three public Video QA benchmarks, NExT-QA, IntentQA and STAR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17415v1-abstract-full').style.display = 'none'; document.getElementById('2412.17415v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted for publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17418">arXiv:2411.17418</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17418">pdf</a>, <a href="https://arxiv.org/format/2411.17418">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multimodal Outer Arithmetic Block Dual Fusion of Whole Slide Images and Omics Data for Precision Oncology </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Alwazzan%2C+O">Omnia Alwazzan</a>, <a href="/search/cs?searchtype=author&amp;query=Gallagher-Syed%2C+A">Amaya Gallagher-Syed</a>, <a href="/search/cs?searchtype=author&amp;query=Millner%2C+T+O">Thomas O. Millner</a>, <a href="/search/cs?searchtype=author&amp;query=Brandner%2C+S">Sebastian Brandner</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a>, <a href="/search/cs?searchtype=author&amp;query=Marino%2C+S">Silvia Marino</a>, <a href="/search/cs?searchtype=author&amp;query=Slabaugh%2C+G">Gregory Slabaugh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17418v2-abstract-short" style="display: inline;"> The integration of DNA methylation data with a Whole Slide Image (WSI) offers significant potential for enhancing the diagnostic precision of central nervous system (CNS) tumor classification in neuropathology. While existing approaches typically integrate encoded omic data with histology at either an early or late fusion stage, the potential of reintroducing omic data through dual fusion remains&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17418v2-abstract-full').style.display = 'inline'; document.getElementById('2411.17418v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17418v2-abstract-full" style="display: none;"> The integration of DNA methylation data with a Whole Slide Image (WSI) offers significant potential for enhancing the diagnostic precision of central nervous system (CNS) tumor classification in neuropathology. While existing approaches typically integrate encoded omic data with histology at either an early or late fusion stage, the potential of reintroducing omic data through dual fusion remains unexplored. 
In this paper, we propose the use of omic embeddings during early and late fusion to capture complementary information from local (patch-level) to global (slide-level) interactions, boosting performance through multimodal integration. In the early fusion stage, omic embeddings are projected onto WSI patches in latent-space, which generates embeddings that encapsulate per-patch molecular and morphological insights. This effectively incorporates omic information into the spatial representation of the WSI. These embeddings are then refined with a Multiple Instance Learning gated attention mechanism which attends to diagnostic patches. In the late fusion stage, we reintroduce the omic data by fusing it with slide-level omic-WSI embeddings using a Multimodal Outer Arithmetic Block (MOAB), which richly intermingles features from both modalities, capturing their correlations and complementarity. We demonstrate accurate CNS tumor subtyping across 20 fine-grained subtypes and validate our approach on benchmark datasets, achieving improved survival prediction on TCGA-BLCA and competitive performance on TCGA-BRCA compared to state-of-the-art methods. This dual fusion strategy enhances interpretability and classification performance, highlighting its potential for clinical diagnostics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17418v2-abstract-full').style.display = 'none'; document.getElementById('2411.17418v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Revised to 10 pages, with corrected typos, updated references (some added, others removed), improved figure quality, modified text for better method validation, added one more co-author, and identified the IEEE member</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15556">arXiv:2411.15556</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.15556">pdf</a>, <a href="https://arxiv.org/format/2411.15556">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ReWind: Understanding Long Videos with Instructed Learnable Memory </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Diko%2C+A">Anxhelo Diko</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+T">Tinghuai Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Swaileh%2C+W">Wassim Swaileh</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+S">Shiyan Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15556v1-abstract-short" style="display: inline;"> Vision-Language Models (VLMs) are crucial for applications requiring integrated understanding textual and visual information. However, existing VLMs struggle with long videos due to computational inefficiency, memory limitations, and difficulties in maintaining coherent understanding across extended sequences. To address these challenges, we introduce ReWind, a novel memory-based VLM designed for&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15556v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15556v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15556v1-abstract-full" style="display: none;"> Vision-Language Models (VLMs) are crucial for applications requiring integrated understanding textual and visual information. However, existing VLMs struggle with long videos due to computational inefficiency, memory limitations, and difficulties in maintaining coherent understanding across extended sequences. To address these challenges, we introduce ReWind, a novel memory-based VLM designed for efficient long video understanding while preserving temporal fidelity. ReWind operates in a two-stage framework. In the first stage, ReWind maintains a dynamic learnable memory module with a novel \textbf{read-perceive-write} cycle that stores and updates instruction-relevant visual information as the video unfolds. This module utilizes learnable queries and cross-attentions between memory contents and the input stream, ensuring low memory requirements by scaling linearly with the number of tokens. 
5. arXiv:2411.15556 [pdf, other] cs.CV, cs.AI
   ReWind: Understanding Long Videos with Instructed Learnable Memory
   Authors: Anxhelo Diko, Tinghuai Wang, Wassim Swaileh, Shiyan Sun, Ioannis Patras
   Abstract: Vision-Language Models (VLMs) are crucial for applications requiring integrated understanding of textual and visual information. However, existing VLMs struggle with long videos due to computational inefficiency, memory limitations, and difficulties in maintaining coherent understanding across extended sequences. To address these challenges, we introduce ReWind, a novel memory-based VLM designed for efficient long video understanding while preserving temporal fidelity. ReWind operates in a two-stage framework. In the first stage, ReWind maintains a dynamic learnable memory module with a novel read-perceive-write cycle that stores and updates instruction-relevant visual information as the video unfolds. This module utilizes learnable queries and cross-attentions between memory contents and the input stream, ensuring low memory requirements by scaling linearly with the number of tokens. In the second stage, we propose an adaptive frame selection mechanism guided by the memory content to identify instruction-relevant key moments. It enriches the memory representations with detailed spatial information by selecting a few high-resolution frames, which are then combined with the memory contents and fed into a Large Language Model (LLM) to generate the final answer. We empirically demonstrate ReWind's superior performance in visual question answering (VQA) and temporal grounding tasks, surpassing previous methods on long video benchmarks. Notably, ReWind achieves a +13% score gain and a +12% accuracy improvement on the MovieChat-1K VQA dataset and an +8% mIoU increase on Charades-STA for temporal grounding.
   Submitted 23 November, 2024; originally announced November 2024.

6. arXiv:2409.18876 [pdf, other] cs.CV
   CemiFace: Center-based Semi-hard Synthetic Face Generation for Face Recognition
   Authors: Zhonglin Sun, Siyang Song, Ioannis Patras, Georgios Tzimiropoulos
   Abstract: Privacy is a main concern in developing face recognition techniques. Although synthetic face images can partially mitigate potential legal risks while maintaining effective face recognition (FR) performance, FR models trained on face images synthesized by existing generative approaches frequently suffer from performance degradation due to the insufficient discriminative quality of these synthesized samples. In this paper, we systematically investigate what contributes to solid face recognition model training, and reveal that face images with a certain degree of similarity to their identity centers show great effectiveness in the performance of trained FR models. Inspired by this, we propose a novel diffusion-based approach, namely Center-based Semi-hard Synthetic Face Generation (CemiFace), which produces facial samples with various levels of similarity to the subject center, thus allowing the generation of face datasets containing effective discriminative samples for training face recognition. Experimental results show that with a modest degree of similarity, training on the generated dataset can produce competitive performance compared to previous generation methods.
   Submitted 30 October, 2024; v1 submitted 27 September, 2024; originally announced September 2024.
   Comments: Accepted to NeurIPS 2024. Camera-ready version

7. arXiv:2409.17717 [pdf, other] cs.CV
   Behaviour4All: in-the-wild Facial Behaviour Analysis Toolkit
   Authors: Dimitrios Kollias, Chunchang Shao, Odysseus Kaloidas, Ioannis Patras
   Abstract: In this paper, we introduce Behavior4All, a comprehensive, open-source toolkit for in-the-wild facial behavior analysis, integrating Face Localization, Valence-Arousal Estimation, Basic Expression Recognition and Action Unit Detection, all within a single framework. Available in both CPU-only and GPU-accelerated versions, Behavior4All leverages 12 large-scale, in-the-wild datasets consisting of over 5 million images from diverse demographic groups. It introduces a novel framework that leverages distribution matching and label co-annotation to address tasks with non-overlapping annotations, encoding prior knowledge of their relatedness. In the largest study of its kind, Behavior4All outperforms both state-of-the-art methods and other toolkits in overall performance as well as fairness across all databases and tasks. It also demonstrates superior generalizability on unseen databases and on compound expression recognition. Finally, Behavior4All is considerably faster than other toolkits.
   Submitted 26 September, 2024; originally announced September 2024.

8. arXiv:2409.11010 [pdf, other] cs.CV
   MM2Latent: Text-to-facial image generation and editing in GANs with multimodal assistance
   Authors: Debin Meng, Christos Tzelepis, Ioannis Patras, Georgios Tzimiropoulos
   Abstract: Generating human portraits is a hot topic in the image generation area, e.g. mask-to-face generation and text-to-face generation. However, these unimodal generation methods lack controllability in image generation. Controllability can be enhanced by exploring the advantages and complementarities of various modalities. For instance, we can utilize the advantages of text in controlling diverse attributes and of masks in controlling spatial locations. Current state-of-the-art methods in multimodal generation face limitations due to their reliance on extensive hyperparameters, manual operations during the inference stage, substantial computational demands during training and inference, or inability to edit real images. In this paper, we propose a practical framework, MM2Latent, for multimodal image generation and editing. We use StyleGAN2 as our image generator, FaRL for text encoding, and train autoencoders for spatial modalities like mask, sketch and 3DMM. We propose a strategy that involves training a mapping network to map the multimodal input into the w latent space of StyleGAN. The proposed framework 1) eliminates hyperparameters and manual operations in the inference stage, 2) ensures fast inference speeds, and 3) enables the editing of real images. Extensive experiments demonstrate that our method exhibits superior performance in multimodal image generation, surpassing recent GAN- and diffusion-based methods. Also, it proves effective in multimodal image editing and is faster than GAN- and diffusion-based methods. We make the code publicly available at: https://github.com/Open-Debin/MM2Latent
   Submitted 17 September, 2024; originally announced September 2024.
   Comments: Accepted at ECCV 2024 AIM workshop
9. arXiv:2408.10012 [pdf, other] cs.CV; doi: 10.1145/3664647.3680664
   CLIPCleaner: Cleaning Noisy Labels with CLIP
   Authors: Chen Feng, Georgios Tzimiropoulos, Ioannis Patras
   Abstract: Learning with Noisy labels (LNL) poses a significant challenge for the Machine Learning community. Some of the most widely used approaches, which select as clean those samples for which the model itself (the in-training model) has high confidence, e.g., 'small loss', can suffer from the so-called 'self-confirmation' bias. This bias arises because the in-training model is at least partially trained on the noisy labels. Furthermore, in the classification case, an additional challenge arises because some of the label noise is between classes that are visually very similar ('hard noise'). This paper addresses these challenges by proposing a method (CLIPCleaner) that leverages CLIP, a powerful Vision-Language (VL) model, for constructing a zero-shot classifier for efficient, offline, clean sample selection. This has the advantage that the sample selection is decoupled from the in-training model and that the sample selection is aware of the semantic and visual similarities between the classes due to the way that CLIP is trained. We provide theoretical justifications and empirical evidence to demonstrate the advantages of CLIP for LNL compared to conventional pre-trained models. Compared to current methods that combine iterative sample selection with various techniques, CLIPCleaner offers a simple, single-step approach that achieves competitive or superior performance on benchmark datasets. To the best of our knowledge, this is the first time a VL model has been used for sample selection to address the problem of Learning with Noisy Labels (LNL), highlighting their potential in the domain.
   Submitted 16 September, 2024; v1 submitted 19 August, 2024; originally announced August 2024.
   Comments: Accepted to ACMMM2024. Code is available at https://github.com/MrChenFeng/CLIPCleaner_ACMMM2024

10. arXiv:2408.09153 [pdf, other] cs.CV
   Are CLIP features all you need for Universal Synthetic Image Origin Attribution?
   Authors: Dario Cioni, Christos Tzelepis, Lorenzo Seidenari, Ioannis Patras
   Abstract: The steady improvement of Diffusion Models for visual synthesis has given rise to many new and interesting use cases of synthetic images but also has raised concerns about their potential abuse, which poses significant societal threats. To address this, fake images need to be detected and attributed to their source model, and given the frequent release of new generators, realistic applications need to consider an Open-Set scenario where some models are unseen at training time. Existing forensic techniques are either limited to Closed-Set settings or to GAN-generated images, relying on fragile frequency-based "fingerprint" features. By contrast, we propose a simple yet effective framework that incorporates features from large pre-trained foundation models to perform Open-Set origin attribution of synthetic images produced by various generative models, including Diffusion Models. We show that our method leads to remarkable attribution performance, even in the low-data regime, exceeding the performance of existing methods and generalizing better to images obtained from a diverse set of architectures. We make the code publicly available at: https://github.com/ciodar/UniversalAttribution
   Submitted 17 August, 2024; originally announced August 2024.
   Comments: Accepted at ECCV 2024 TWYN workshop

11. arXiv:2408.04983 [pdf] cs.CL
   Get Confused Cautiously: Textual Sequence Memorization Erasure with Selective Entropy Maximization
   Authors: Zhaohan Zhang, Ziquan Liu, Ioannis Patras
   Abstract: Large Language Models (LLMs) have been found to memorize and recite some of the textual sequences from their training set verbatim, raising broad concerns about privacy and copyright issues when using LLMs. This Textual Sequence Memorization (TSM) phenomenon leads to a high demand to regulate LLM output to prevent it from generating certain memorized text in order to meet user requirements. However, our empirical study reveals that existing methods for TSM erasure fail to forget massive memorized samples without substantially jeopardizing the model utility. To achieve a better trade-off between the effectiveness of TSM erasure and model utility in LLMs, our paper proposes a new framework based on Entropy Maximization with Selective Optimization (EMSO), where the updated weights are chosen with a novel contrastive gradient metric without the participation of any additional model or data. Our analysis shows that training with the entropy maximization loss has a more stable optimization process and better preserves model utility than existing methods. The contrastive gradient metric localizes the most influential weights for TSM erasure by taking both the gradient magnitude and direction into consideration. Extensive experiments across three model scales demonstrate that our method excels in handling large-scale forgetting requests while preserving model ability in language generation and reasoning.
   Submitted 9 August, 2024; originally announced August 2024.
   Comments: 15 pages, 7 figures

12. arXiv:2407.16804 [pdf] cs.LG, cs.AI, cs.CY, cs.ET
   Multimodal Machine Learning in Mental Health: A Survey of Data, Algorithms, and Challenges
   Authors: Zahraa Al Sahili, Ioannis Patras, Matthew Purver
   Abstract: The application of machine learning (ML) in detecting, diagnosing, and treating mental health disorders is garnering increasing attention. Traditionally, research has focused on single modalities, such as text from clinical notes, audio from speech samples, or video of interaction patterns. Recently, multimodal ML, which combines information from multiple modalities, has demonstrated significant promise in offering novel insights into human behavior patterns and recognizing mental health symptoms and risk factors. Despite its potential, multimodal ML in mental health remains an emerging field, facing several complex challenges before practical applications can be effectively developed. This survey provides a comprehensive overview of the data availability and current state-of-the-art multimodal ML applications for mental health. It discusses key challenges that must be addressed to advance the field. The insights from this survey aim to deepen the understanding of the potential and limitations of multimodal ML in mental health, guiding future research and development in this evolving domain.
   Submitted 23 July, 2024; originally announced July 2024.

13. arXiv:2407.11168 [pdf, other] cs.CV
   Efficient Unsupervised Visual Representation Learning with Explicit Cluster Balancing
   Authors: Ioannis Maniadis Metaxas, Georgios Tzimiropoulos, Ioannis Patras
   Abstract: Self-supervised learning has recently emerged as the preeminent pretraining paradigm across and between modalities, with remarkable results. In the image domain specifically, group (or cluster) discrimination has been one of the most successful methods. However, such frameworks need to guard against heavily imbalanced cluster assignments to prevent collapse to trivial solutions. Existing works typically solve this by reweighing cluster assignments to promote balance, or with offline operations (e.g. regular re-clustering) that prevent collapse. However, the former typically requires large batch sizes, which leads to increased resource requirements, and the latter introduces scalability issues with regard to large datasets. In this work, we propose ExCB, a framework that tackles this problem with a novel cluster balancing method. ExCB estimates the relative size of the clusters across batches and balances them by adjusting cluster assignments, proportionately to their relative size and in an online manner. Thereby, it overcomes previous methods' dependence on large batch sizes and is fully online, and therefore scalable to any dataset. We conduct extensive experiments to evaluate our approach and demonstrate that ExCB: a) achieves state-of-the-art results with significantly reduced resource requirements compared to previous works, b) is fully online, and therefore scalable to large datasets, and c) is stable and effective even with very small batch sizes.
   Submitted 15 July, 2024; originally announced July 2024.
   Comments: Accepted at ECCV 2024

14. arXiv:2406.09070 [pdf, other] cs.LG, cs.AI, cs.CV
   FairCoT: Enhancing Fairness in Diffusion Models via Chain of Thought Reasoning of Multimodal Language Models
   Authors: Zahraa Al Sahili, Ioannis Patras, Matthew Purver
   Abstract: In the domain of text-to-image generative models, biases inherent in training datasets often propagate into generated content, posing significant ethical challenges, particularly in socially sensitive contexts. We introduce FairCoT, a novel framework that enhances fairness in diffusion models through Chain-of-Thought (CoT) reasoning within multimodal generative large language models (LLMs).
FairCo&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09070v2-abstract-full').style.display = 'inline'; document.getElementById('2406.09070v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.09070v2-abstract-full" style="display: none;"> In the domain of text-to-image generative models, biases inherent in training datasets often propagate into generated content, posing significant ethical challenges, particularly in socially sensitive contexts. We introduce FairCoT, a novel framework that enhances fairness in diffusion models through Chain-of-Thought (CoT) reasoning within multimodal generative large language models (LLMs). FairCoT employs iterative CoT refinement and attire-based attribute prediction to systematically mitigate biases, ensuring diverse and equitable representation in generated images. By integrating iterative reasoning processes, FairCoT addresses the limitations of zero-shot CoT in sensitive scenarios, balancing creativity with ethical responsibility. Experimental evaluations across multiple models, including DALL-E and various Stable Diffusion variants, demonstrate that FairCoT significantly improves fairness and diversity metrics without compromising image quality or relevance. Our approach advances ethical AI practices in generative modeling, promoting socially responsible content generation and setting new standards for fairness in AI-generated imagery. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09070v2-abstract-full').style.display = 'none'; document.getElementById('2406.09070v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.19100">arXiv:2405.19100</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.19100">pdf</a>, <a href="https://arxiv.org/format/2405.19100">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Zero-Shot Facial Expression Recognition by LLM Knowledge Transfer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Z">Zengqun Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Y">Yu Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Gong%2C+S">Shaogang Gong</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.19100v3-abstract-short" style="display: inline;"> Current facial expression recognition (FER) models are often designed in a supervised learning manner and thus are constrained by the lack of large-scale facial expression images with high-quality annotations. 
Consequently, these models often fail to generalize well, performing poorly on unseen images in inference. Vision-language-based zero-shot models demonstrate a promising potential for addres&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.19100v3-abstract-full').style.display = 'inline'; document.getElementById('2405.19100v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.19100v3-abstract-full" style="display: none;"> Current facial expression recognition (FER) models are often designed in a supervised learning manner and thus are constrained by the lack of large-scale facial expression images with high-quality annotations. Consequently, these models often fail to generalize well, performing poorly on unseen images in inference. Vision-language-based zero-shot models demonstrate a promising potential for addressing such challenges. However, these models lack task-specific knowledge and therefore are not optimized for the nuances of recognizing facial expressions. To bridge this gap, this work proposes a novel method, Exp-CLIP, to enhance zero-shot FER by transferring the task knowledge from large language models (LLMs). Specifically, based on the pre-trained vision-language encoders, we incorporate a projection head designed to map the initial joint vision-language space into a space that captures representations of facial actions. To train this projection head for subsequent zero-shot predictions, we propose to align the projected visual representations with task-specific semantic meanings derived from the LLM encoder, and the text instruction-based strategy is employed to customize the LLM knowledge. Given unlabelled facial data and efficient training of the projection head, Exp-CLIP achieves superior zero-shot results to the CLIP models and several other large vision-language models (LVLMs) on seven in-the-wild FER datasets. The code and pre-trained models are available at https://github.com/zengqunzhao/Exp-CLIP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.19100v3-abstract-full').style.display = 'none'; document.getElementById('2405.19100v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at WACV 2025 (Camera-Ready Version)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.18591">arXiv:2404.18591</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.18591">pdf</a>, <a href="https://arxiv.org/format/2404.18591">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FashionSD-X: Multimodal Fashion Garment Synthesis using Latent Diffusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Singh%2C+A+K">Abhishek Kumar Singh</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.18591v1-abstract-short" style="display: inline;"> The rapid evolution of the fashion industry increasingly intersects with technological advancements, particularly through the integration of generative AI. This study introduces a novel generative pipeline designed to transform the fashion design process by employing latent diffusion models. Utilizing ControlNet and LoRA fine-tuning, our approach generates high-quality images from multimodal input&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.18591v1-abstract-full').style.display = 'inline'; document.getElementById('2404.18591v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.18591v1-abstract-full" style="display: none;"> The rapid evolution of the fashion industry increasingly intersects with technological advancements, particularly through the integration of generative AI. This study introduces a novel generative pipeline designed to transform the fashion design process by employing latent diffusion models. Utilizing ControlNet and LoRA fine-tuning, our approach generates high-quality images from multimodal inputs such as text and sketches. We leverage and enhance state-of-the-art virtual try-on datasets, including Multimodal Dress Code and VITON-HD, by integrating sketch data. Our evaluation, utilizing metrics like FID, CLIP Score, and KID, demonstrates that our model significantly outperforms traditional stable diffusion models. The results not only highlight the effectiveness of our model in generating fashion-appropriate outputs but also underscore the potential of diffusion models in revolutionizing fashion design workflows. This research paves the way for more interactive, personalized, and technologically enriched methodologies in fashion design and representation, bridging the gap between creative vision and practical application. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.18591v1-abstract-full').style.display = 'none'; document.getElementById('2404.18591v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.07078">arXiv:2404.07078</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.07078">pdf</a>, <a href="https://arxiv.org/format/2404.07078">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> VLLMs Provide Better Context for Emotion Understanding Through Common Sense Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xenos%2C+A">Alexandros Xenos</a>, <a href="/search/cs?searchtype=author&amp;query=Foteinopoulou%2C+N+M">Niki Maria Foteinopoulou</a>, <a href="/search/cs?searchtype=author&amp;query=Ntinou%2C+I">Ioanna Ntinou</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a>, <a href="/search/cs?searchtype=author&amp;query=Tzimiropoulos%2C+G">Georgios Tzimiropoulos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.07078v1-abstract-short" style="display: inline;"> Recognising emotions in context involves identifying the apparent emotions of an individual, taking into account contextual cues from the surrounding scene. Previous approaches to this task have involved the design of explicit scene-encoding architectures or the incorporation of external scene-related information, such as captions. However, these methods often utilise limited contextual informatio&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.07078v1-abstract-full').style.display = 'inline'; document.getElementById('2404.07078v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.07078v1-abstract-full" style="display: none;"> Recognising emotions in context involves identifying the apparent emotions of an individual, taking into account contextual cues from the surrounding scene. Previous approaches to this task have involved the design of explicit scene-encoding architectures or the incorporation of external scene-related information, such as captions. However, these methods often utilise limited contextual information or rely on intricate training pipelines. In this work, we leverage the groundbreaking capabilities of Vision-and-Large-Language Models (VLLMs) to enhance in-context emotion classification without introducing complexity to the training process in a two-stage approach. 
In the first stage, we propose prompting VLLMs to generate descriptions in natural language of the subject&#39;s apparent emotion relative to the visual context. In the second stage, the descriptions are used as contextual information and, along with the image input, are used to train a transformer-based architecture that fuses text and visual features before the final classification task. Our experimental results show that the text and image features have complementary information, and our fused architecture significantly outperforms the individual modalities without any complex training methods. We evaluate our approach on three different datasets, namely, EMOTIC, CAER-S, and BoLD, and achieve state-of-the-art or comparable accuracy across all datasets and metrics compared to much more complex approaches. The code will be made publicly available on github: https://github.com/NickyFot/EmoCommonSense.git <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.07078v1-abstract-full').style.display = 'none'; document.getElementById('2404.07078v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">A. Xenos, N. Foteinopoulou and I. Ntinou contributed equally to this work; 14 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.17217">arXiv:2403.17217</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.17217">pdf</a>, <a href="https://arxiv.org/format/2403.17217">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DiffusionAct: Controllable Diffusion Autoencoder for One-shot Face Reenactment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bounareli%2C+S">Stella Bounareli</a>, <a href="/search/cs?searchtype=author&amp;query=Tzelepis%2C+C">Christos Tzelepis</a>, <a href="/search/cs?searchtype=author&amp;query=Argyriou%2C+V">Vasileios Argyriou</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a>, <a href="/search/cs?searchtype=author&amp;query=Tzimiropoulos%2C+G">Georgios Tzimiropoulos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.17217v1-abstract-short" style="display: inline;"> Video-driven neural face reenactment aims to synthesize realistic facial images that successfully preserve the identity and appearance of a source face, while transferring the target head pose and facial expressions. 
Existing GAN-based methods suffer from either distortions and visual artifacts or poor reconstruction quality, i.e., the background and several important appearance details, such as h&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17217v1-abstract-full').style.display = 'inline'; document.getElementById('2403.17217v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.17217v1-abstract-full" style="display: none;"> Video-driven neural face reenactment aims to synthesize realistic facial images that successfully preserve the identity and appearance of a source face, while transferring the target head pose and facial expressions. Existing GAN-based methods suffer from either distortions and visual artifacts or poor reconstruction quality, i.e., the background and several important appearance details, such as hair style/color, glasses and accessories, are not faithfully reconstructed. Recent advances in Diffusion Probabilistic Models (DPMs) enable the generation of high-quality realistic images. To this end, in this paper we present DiffusionAct, a novel method that leverages the photo-realistic image generation of diffusion models to perform neural face reenactment. Specifically, we propose to control the semantic space of a Diffusion Autoencoder (DiffAE), in order to edit the facial pose of the input images, defined as the head pose orientation and the facial expressions. Our method allows one-shot, self, and cross-subject reenactment, without requiring subject-specific fine-tuning. We compare against state-of-the-art GAN-, StyleGAN2-, and diffusion-based methods, showing better or on-par reenactment performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.17217v1-abstract-full').style.display = 'none'; document.getElementById('2403.17217v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://stelabou.github.io/diffusionact/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.08161">arXiv:2403.08161</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.08161">pdf</a>, <a href="https://arxiv.org/format/2403.08161">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> LAFS: Landmark-based Facial Self-supervised Learning for Face Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Z">Zhonglin Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+C">Chen Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a>, <a href="/search/cs?searchtype=author&amp;query=Tzimiropoulos%2C+G">Georgios Tzimiropoulos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.08161v1-abstract-short" style="display: inline;"> In this work we focus on learning facial representations that can be adapted to train effective face recognition models, particularly in the absence of labels. Firstly, compared with existing labelled face datasets, a vastly larger magnitude of unlabeled faces exists in the real world. We explore the learning strategy of these unlabeled facial images through self-supervised pretraining to transfer&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08161v1-abstract-full').style.display = 'inline'; document.getElementById('2403.08161v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.08161v1-abstract-full" style="display: none;"> In this work we focus on learning facial representations that can be adapted to train effective face recognition models, particularly in the absence of labels. Firstly, compared with existing labelled face datasets, a vastly larger magnitude of unlabeled faces exists in the real world. We explore the learning strategy of these unlabeled facial images through self-supervised pretraining to transfer generalized face recognition performance. Moreover, motivated by one recent finding, that is, the face saliency area is critical for face recognition, in contrast to utilizing random cropped blocks of images for constructing augmentations in pretraining, we utilize patches localized by extracted facial landmarks. This enables our method - namely LAndmark-based Facial Self-supervised learning LAFS), to learn key representation that is more critical for face recognition. We also incorporate two landmark-specific augmentations which introduce more diversity of landmark information to further regularize the learning. With learned landmark-based facial representations, we further adapt the representation for face recognition with regularization mitigating variations in landmark positions. 
Our method achieves significant improvement over the state-of-the-art on multiple face recognition benchmarks, especially on more challenging few-shot scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.08161v1-abstract-full').style.display = 'none'; document.getElementById('2403.08161v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted to CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.06349">arXiv:2403.06349</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.06349">pdf</a>, <a href="https://arxiv.org/format/2403.06349">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ISBI53787.2023.10230698">10.1109/ISBI53787.2023.10230698 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> MOAB: Multi-Modal Outer Arithmetic Block For Fusion Of Histopathological Images And Genetic Data For Brain Tumor Grading </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Alwazzan%2C+O">Omnia Alwazzan</a>, <a href="/search/cs?searchtype=author&amp;query=Khan%2C+A">Abbas Khan</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a>, <a href="/search/cs?searchtype=author&amp;query=Slabaugh%2C+G">Gregory Slabaugh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.06349v1-abstract-short" style="display: inline;"> Brain tumors are an abnormal growth of cells in the brain. They can be classified into distinct grades based on their growth. Often grading is performed based on a histological image and is one of the most significant predictors of a patients prognosis, the higher the grade, the more aggressive the tumor. Correct diagnosis of a tumor grade remains challenging. Though histopathological grading has&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.06349v1-abstract-full').style.display = 'inline'; document.getElementById('2403.06349v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.06349v1-abstract-full" style="display: none;"> Brain tumors are an abnormal growth of cells in the brain. They can be classified into distinct grades based on their growth. 
Often grading is performed based on a histological image and is one of the most significant predictors of a patient&#39;s prognosis; the higher the grade, the more aggressive the tumor. Correct diagnosis of a tumor grade remains challenging. Though histopathological grading has been shown to be prognostic, results are subject to interobserver variability, even among experienced pathologists. Recently, the World Health Organization reported that advances in molecular genetics have led to improvements in tumor classification. This paper seeks to integrate histological images and genetic data for improved computer-aided diagnosis. We propose a novel Multi-modal Outer Arithmetic Block (MOAB) based on arithmetic operations to combine latent representations of the different modalities for predicting the tumor grade (Grade II, III and IV). Extensive experiments evaluate the effectiveness of our approach. By applying MOAB to The Cancer Genome Atlas (TCGA) glioma dataset, we show that it can improve separation between similar classes (Grade II and III) and outperform prior state-of-the-art grade classification techniques. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.06349v1-abstract-full').style.display = 'none'; document.getElementById('2403.06349v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE International Symposium on Biomedical Imaging (ISBI) 2023, pp. 1-5 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.06339">arXiv:2403.06339</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.06339">pdf</a>, <a href="https://arxiv.org/format/2403.06339">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FOAA: Flattened Outer Arithmetic Attention For Multimodal Tumor Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Alwazzan%2C+O">Omnia Alwazzan</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a>, <a href="/search/cs?searchtype=author&amp;query=Slabaugh%2C+G">Gregory Slabaugh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.06339v1-abstract-short" style="display: inline;"> Fusion of multimodal healthcare data holds great promise to provide a holistic view of a patient&#39;s health, taking advantage of the complementarity of different modalities while leveraging their correlation. This paper proposes a simple and effective approach, inspired by attention, to fuse discriminative features from different modalities. 
We propose a novel attention mechanism, called Flattened O&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.06339v1-abstract-full').style.display = 'inline'; document.getElementById('2403.06339v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.06339v1-abstract-full" style="display: none;"> Fusion of multimodal healthcare data holds great promise to provide a holistic view of a patient&#39;s health, taking advantage of the complementarity of different modalities while leveraging their correlation. This paper proposes a simple and effective approach, inspired by attention, to fuse discriminative features from different modalities. We propose a novel attention mechanism, called Flattened Outer Arithmetic Attention (FOAA), which relies on outer arithmetic operators (addition, subtraction, product, and division) to compute attention scores from keys, queries and values derived from flattened embeddings of each modality. We demonstrate how FOAA can be implemented for self-attention and cross-attention, providing a reusable component in neural network architectures. We evaluate FOAA on two datasets for multimodal tumor classification and achieve state-of-the-art results, and we demonstrate that features enriched by FOAA are superior to those derived from other fusion approaches. The code is publicly available at \href{https://github.com/omniaalwazzan/FOAA}{https://github.com/omniaalwazzan/FOAA} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.06339v1-abstract-full').style.display = 'none'; document.getElementById('2403.06339v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted for ISBI-2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.02138">arXiv:2403.02138</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.02138">pdf</a>, <a href="https://arxiv.org/format/2403.02138">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Self-Supervised Facial Representation Learning with Facial Region Awareness </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gao%2C+Z">Zheng Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.02138v1-abstract-short" style="display: inline;"> Self-supervised pre-training has been proved to be effective in learning transferable representations that benefit various visual tasks. This paper asks this question: can self-supervised pre-training learn general facial representations for various facial analysis tasks? 
Recent efforts toward this goal are limited to treating each face image as a whole, i.e., learning consistent facial representa&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.02138v1-abstract-full').style.display = 'inline'; document.getElementById('2403.02138v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.02138v1-abstract-full" style="display: none;"> Self-supervised pre-training has been proved to be effective in learning transferable representations that benefit various visual tasks. This paper asks this question: can self-supervised pre-training learn general facial representations for various facial analysis tasks? Recent efforts toward this goal are limited to treating each face image as a whole, i.e., learning consistent facial representations at the image-level, which overlooks the consistency of local facial representations (i.e., facial regions like eyes, nose, etc). In this work, we make a first attempt to propose a novel self-supervised facial representation learning framework to learn consistent global and local facial representations, Facial Region Awareness (FRA). Specifically, we explicitly enforce the consistency of facial regions by matching the local facial representations across views, which are extracted with learned heatmaps highlighting the facial regions. Inspired by the mask prediction in supervised semantic segmentation, we obtain the heatmaps via cosine similarity between the per-pixel projection of feature maps and facial mask embeddings computed from learnable positional embeddings, which leverage the attention mechanism to globally look up the facial image for facial regions. To learn such heatmaps, we formulate the learning of facial mask embeddings as a deep clustering problem by assigning the pixel features from the feature maps to them. The transfer learning results on facial classification and regression tasks show that our FRA outperforms previous pre-trained models and more importantly, using ResNet as the unified backbone for various tasks, our FRA achieves comparable or even better performance compared with SOTA methods in facial analysis tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.02138v1-abstract-full').style.display = 'none'; document.getElementById('2403.02138v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. 
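<p class="is-size-7">A minimal PyTorch sketch of the cosine-similarity heatmap computation described in the abstract above, assuming per-pixel projected features and learnable facial mask embeddings; all names and the temperature value are illustrative assumptions rather than the authors' implementation.</p> <pre><code>import torch
import torch.nn.functional as F

def facial_region_heatmaps(pixel_feats: torch.Tensor,
                           mask_embeds: torch.Tensor,
                           tau: float = 0.1) -> torch.Tensor:
    """Soft facial-region heatmaps from cosine similarity.

    pixel_feats: (B, C, H, W) per-pixel projections of the feature map.
    mask_embeds: (K, C) learnable facial mask embeddings (K regions).
    Returns (B, K, H, W) heatmaps: a softmax over regions assigns each
    pixel to the facial region whose embedding it most resembles.
    """
    B, C, H, W = pixel_feats.shape
    pix = F.normalize(pixel_feats.flatten(2).transpose(1, 2), dim=-1)  # (B, HW, C)
    reg = F.normalize(mask_embeds, dim=-1)                             # (K, C)
    sim = pix @ reg.t() / tau                                          # (B, HW, K)
    return sim.softmax(dim=-1).transpose(1, 2).reshape(B, -1, H, W)
</code></pre>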
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.12550">arXiv:2402.12550</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.12550">pdf</a>, <a href="https://arxiv.org/format/2402.12550">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Multilinear Mixture of Experts: Scalable Expert Specialization through Factorization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Oldfield%2C+J">James Oldfield</a>, <a href="/search/cs?searchtype=author&amp;query=Georgopoulos%2C+M">Markos Georgopoulos</a>, <a href="/search/cs?searchtype=author&amp;query=Chrysos%2C+G+G">Grigorios G. Chrysos</a>, <a href="/search/cs?searchtype=author&amp;query=Tzelepis%2C+C">Christos Tzelepis</a>, <a href="/search/cs?searchtype=author&amp;query=Panagakis%2C+Y">Yannis Panagakis</a>, <a href="/search/cs?searchtype=author&amp;query=Nicolaou%2C+M+A">Mihalis A. Nicolaou</a>, <a href="/search/cs?searchtype=author&amp;query=Deng%2C+J">Jiankang Deng</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.12550v4-abstract-short" style="display: inline;"> The Mixture of Experts (MoE) paradigm provides a powerful way to decompose dense layers into smaller, modular computations often more amenable to human interpretation, debugging, and editability. However, a major challenge lies in the computational cost of scaling the number of experts high enough to achieve fine-grained specialization. In this paper, we propose the Multilinear Mixture of Experts&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.12550v4-abstract-full').style.display = 'inline'; document.getElementById('2402.12550v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.12550v4-abstract-full" style="display: none;"> The Mixture of Experts (MoE) paradigm provides a powerful way to decompose dense layers into smaller, modular computations often more amenable to human interpretation, debugging, and editability. However, a major challenge lies in the computational cost of scaling the number of experts high enough to achieve fine-grained specialization. In this paper, we propose the Multilinear Mixture of Experts ($\mu$MoE) layer to address this, focusing on vision models. $\mu$MoE layers enable scalable expert specialization by performing an implicit computation on prohibitively large weight tensors entirely in factorized form. Consequently, $\mu$MoEs (1) avoid the restrictively high inference-time costs of dense MoEs, yet (2) do not inherit the training issues of the popular sparse MoEs&#39; discrete (non-differentiable) expert routing. We present both qualitative and quantitative evidence that scaling $\mu$MoE layers when fine-tuning foundation models for vision tasks leads to more specialized experts at the class-level, further enabling manual bias correction in CelebA attribute classification. 
Finally, we show qualitative results demonstrating the expert specialism achieved when pre-training large GPT2 and MLP-Mixer models with parameter-matched $\mu$MoE blocks at every layer, maintaining comparable accuracy. Our code is available at: https://github.com/james-oldfield/muMoE. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.12550v4-abstract-full').style.display = 'none'; document.getElementById('2402.12550v4-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at NeurIPS 2024. Github: https://github.com/james-oldfield/muMoE. Project page: https://james-oldfield.github.io/muMoE</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.03553">arXiv:2402.03553</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.03553">pdf</a>, <a href="https://arxiv.org/format/2402.03553">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> One-shot Neural Face Reenactment via Finding Directions in GAN&#39;s Latent Space </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Bounareli%2C+S">Stella Bounareli</a>, <a href="/search/cs?searchtype=author&amp;query=Tzelepis%2C+C">Christos Tzelepis</a>, <a href="/search/cs?searchtype=author&amp;query=Argyriou%2C+V">Vasileios Argyriou</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a>, <a href="/search/cs?searchtype=author&amp;query=Tzimiropoulos%2C+G">Georgios Tzimiropoulos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2402.03553v1-abstract-short" style="display: inline;"> In this paper, we present our framework for neural face/head reenactment whose goal is to transfer the 3D head orientation and expression of a target face to a source face. Previous methods focus on learning embedding networks for identity and head pose/expression disentanglement which proves to be a rather hard task, degrading the quality of the generated images. We take a different approach, byp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.03553v1-abstract-full').style.display = 'inline'; document.getElementById('2402.03553v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2402.03553v1-abstract-full" style="display: none;"> In this paper, we present our framework for neural face/head reenactment whose goal is to transfer the 3D head orientation and expression of a target face to a source face. 
Previous methods focus on learning embedding networks for identity and head pose/expression disentanglement which proves to be a rather hard task, degrading the quality of the generated images. We take a different approach, bypassing the training of such networks, by using (fine-tuned) pre-trained GANs which have been shown capable of producing high-quality facial images. Because GANs are characterized by weak controllability, the core of our approach is a method to discover which directions in latent GAN space are responsible for controlling head pose and expression variations. We present a simple pipeline to learn such directions with the aid of a 3D shape model which, by construction, inherently captures disentangled directions for head pose, identity, and expression. Moreover, we show that by embedding real images in the GAN latent space, our method can be successfully used for the reenactment of real-world faces. Our method features several favorable properties including using a single source image (one-shot) and enabling cross-person reenactment. Extensive qualitative and quantitative results show that our approach typically produces reenacted faces of notably higher quality than those produced by state-of-the-art methods for the standard benchmarks of VoxCeleb1 &amp; 2. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2402.03553v1-abstract-full').style.display = 'none'; document.getElementById('2402.03553v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Preprint version, accepted for publication in International Journal of Computer Vision (IJCV)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.01573">arXiv:2311.01573</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2311.01573">pdf</a>, <a href="https://arxiv.org/format/2311.01573">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Improving Fairness using Vision-Language Driven Image Augmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=D%27Inc%C3%A0%2C+M">Moreno D&#39;Inc脿</a>, <a href="/search/cs?searchtype=author&amp;query=Tzelepis%2C+C">Christos Tzelepis</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a>, <a href="/search/cs?searchtype=author&amp;query=Sebe%2C+N">Nicu Sebe</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.01573v1-abstract-short" style="display: inline;"> Fairness is 
crucial when training a deep-learning discriminative model, especially in the facial domain. Models tend to correlate specific characteristics (such as age and skin color) with unrelated attributes (downstream tasks), resulting in biases which do not correspond to reality. It is common knowledge that these correlations are present in the data and are then transferred to the models duri&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.01573v1-abstract-full').style.display = 'inline'; document.getElementById('2311.01573v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.01573v1-abstract-full" style="display: none;"> Fairness is crucial when training a deep-learning discriminative model, especially in the facial domain. Models tend to correlate specific characteristics (such as age and skin color) with unrelated attributes (downstream tasks), resulting in biases which do not correspond to reality. It is common knowledge that these correlations are present in the data and are then transferred to the models during training. This paper proposes a method to mitigate these correlations to improve fairness. To do so, we learn interpretable and meaningful paths lying in the semantic space of a pre-trained diffusion model (DiffAE) -- such paths being supervised by contrastive text dipoles. That is, we learn to edit protected characteristics (age and skin color). These paths are then applied to augment images to improve the fairness of a given dataset. We test the proposed method on CelebA-HQ and UTKFace on several downstream tasks with age and skin color as protected characteristics. As a proxy for fairness, we compute the difference in accuracy with respect to the protected characteristics. Quantitative results show how the augmented images help the model improve the overall accuracy, the aforementioned metric, and the disparity of equal opportunity. Code is available at: https://github.com/Moreno98/Vision-Language-Bias-Control. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.01573v1-abstract-full').style.display = 'none'; document.getElementById('2311.01573v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication in WACV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.16677">arXiv:2310.16677</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.16677">pdf</a>, <a href="https://arxiv.org/format/2310.16677">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Machine Learning Approaches for Fine-Grained Symptom Estimation in Schizophrenia: A Comprehensive Review </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Foteinopoulou%2C+N+M">Niki Maria Foteinopoulou</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.16677v1-abstract-short" style="display: inline;"> Schizophrenia is a severe yet treatable mental disorder, it is diagnosed using a multitude of primary and secondary symptoms. Diagnosis and treatment for each individual depends on the severity of the symptoms, therefore there is a need for accurate, personalised assessments. However, the process can be both time-consuming and subjective; hence, there is a motivation to explore automated methods t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.16677v1-abstract-full').style.display = 'inline'; document.getElementById('2310.16677v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.16677v1-abstract-full" style="display: none;"> Schizophrenia is a severe yet treatable mental disorder, it is diagnosed using a multitude of primary and secondary symptoms. Diagnosis and treatment for each individual depends on the severity of the symptoms, therefore there is a need for accurate, personalised assessments. However, the process can be both time-consuming and subjective; hence, there is a motivation to explore automated methods that can offer consistent diagnosis and precise symptom assessments, thereby complementing the work of healthcare practitioners. Machine Learning has demonstrated impressive capabilities across numerous domains, including medicine; the use of Machine Learning in patient assessment holds great promise for healthcare professionals and patients alike, as it can lead to more consistent and accurate symptom estimation.This survey aims to review methodologies that utilise Machine Learning for diagnosis and assessment of schizophrenia. Contrary to previous reviews that primarily focused on binary classification, this work recognises the complexity of the condition and instead, offers an overview of Machine Learning methods designed for fine-grained symptom estimation. We cover multiple modalities, namely Medical Imaging, Electroencephalograms and Audio-Visual, as the illness symptoms can manifest themselves both in a patient&#39;s pathology and behaviour. Finally, we analyse the datasets and methodologies used in the studies and identify trends, gaps as well as opportunities for future research. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.16677v1-abstract-full').style.display = 'none'; document.getElementById('2310.16677v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.16640">arXiv:2310.16640</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.16640">pdf</a>, <a href="https://arxiv.org/ps/2310.16640">ps</a>, <a href="https://arxiv.org/format/2310.16640">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> EmoCLIP: A Vision-Language Method for Zero-Shot Video Facial Expression Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Foteinopoulou%2C+N+M">Niki Maria Foteinopoulou</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.16640v2-abstract-short" style="display: inline;"> Facial Expression Recognition (FER) is a crucial task in affective computing, but its conventional focus on the seven basic emotions limits its applicability to the complex and expanding emotional spectrum. To address the issue of new and unseen emotions present in dynamic in-the-wild FER, we propose a novel vision-language model that utilises sample-level text descriptions (i.e. captions of the c&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.16640v2-abstract-full').style.display = 'inline'; document.getElementById('2310.16640v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.16640v2-abstract-full" style="display: none;"> Facial Expression Recognition (FER) is a crucial task in affective computing, but its conventional focus on the seven basic emotions limits its applicability to the complex and expanding emotional spectrum. To address the issue of new and unseen emotions present in dynamic in-the-wild FER, we propose a novel vision-language model that utilises sample-level text descriptions (i.e. captions of the context, expressions or emotional cues) as natural language supervision, aiming to enhance the learning of rich latent representations, for zero-shot classification. To test this, we evaluate using zero-shot classification of the model trained on sample-level descriptions on four popular dynamic FER datasets. Our findings show that this approach yields significant improvements when compared to baseline methods. 
Specifically, for zero-shot video FER, we outperform CLIP by over 10\% in terms of Weighted Average Recall and 5\% in terms of Unweighted Average Recall on several datasets. Furthermore, we evaluate the representations obtained from the network trained using sample-level descriptions on the downstream task of mental health symptom estimation, achieving performance comparable or superior to state-of-the-art methods and strong agreement with human experts. Namely, we achieve a Pearson&#39;s Correlation Coefficient of up to 0.85 on schizophrenia symptom severity estimation, which is comparable to human experts&#39; agreement. The code is publicly available at: https://github.com/NickyFot/EmoCLIP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.16640v2-abstract-full').style.display = 'none'; document.getElementById('2310.16640v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at FG&#39;2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.13570">arXiv:2310.13570</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2310.13570">pdf</a>, <a href="https://arxiv.org/format/2310.13570">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Simple Baseline for Knowledge-Based Visual Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xenos%2C+A">Alexandros Xenos</a>, <a href="/search/cs?searchtype=author&amp;query=Stafylakis%2C+T">Themos Stafylakis</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a>, <a href="/search/cs?searchtype=author&amp;query=Tzimiropoulos%2C+G">Georgios Tzimiropoulos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.13570v2-abstract-short" style="display: inline;"> This paper is on the problem of Knowledge-Based Visual Question Answering (KB-VQA). Recent works have emphasized the significance of incorporating both explicit (through external databases) and implicit (through LLMs) knowledge to answer questions requiring external knowledge effectively. A common limitation of such approaches is that they consist of relatively complicated pipelines and often heav&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.13570v2-abstract-full').style.display = 'inline'; document.getElementById('2310.13570v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.13570v2-abstract-full" style="display: none;"> This paper is on the problem of Knowledge-Based Visual Question Answering (KB-VQA). 
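For context, zero-shot classification in this CLIP-style setting typically amounts to comparing a video embedding against text embeddings of per-class descriptions. The sketch below illustrates only that generic protocol, not the EmoCLIP implementation; the random tensors stand in for real encoder outputs.

```python
import torch
import torch.nn.functional as F

def zero_shot_scores(video_embedding, class_text_embeddings, temperature=0.07):
    """Cosine-similarity zero-shot classification, CLIP style.

    video_embedding:       (D,) pooled embedding of a video clip
    class_text_embeddings: (C, D) one text-description embedding per class
    Returns class probabilities of shape (C,).
    """
    v = F.normalize(video_embedding, dim=-1)
    t = F.normalize(class_text_embeddings, dim=-1)
    logits = (t @ v) / temperature
    return logits.softmax(dim=-1)

# hypothetical toy tensors standing in for real encoder outputs
probs = zero_shot_scores(torch.randn(512), torch.randn(7, 512))
print(probs.argmax().item())
```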
arXiv:2310.13570 (https://arxiv.org/abs/2310.13570) [pdf, other] cs.CV
Title: A Simple Baseline for Knowledge-Based Visual Question Answering
Authors: Alexandros Xenos, Themos Stafylakis, Ioannis Patras, Georgios Tzimiropoulos
Abstract: This paper is on the problem of Knowledge-Based Visual Question Answering (KB-VQA). Recent works have emphasized the significance of incorporating both explicit (through external databases) and implicit (through LLMs) knowledge to answer questions requiring external knowledge effectively. A common limitation of such approaches is that they consist of relatively complicated pipelines and often heavily rely on accessing the GPT-3 API. Our main contribution is a much simpler and readily reproducible pipeline which, in a nutshell, is based on efficient in-context learning by prompting LLaMA (1 and 2) with question-informative captions as contextual information. Contrary to recent approaches, our method is training-free, does not require access to external databases or APIs, and yet achieves state-of-the-art accuracy on the OK-VQA and A-OK-VQA datasets. Finally, we perform several ablation studies to understand important aspects of our method. Our code is publicly available at https://github.com/alexandrosXe/ASimple-Baseline-For-Knowledge-Based-VQA
Submitted 24 October, 2023; v1 submitted 20 October, 2023; originally announced October 2023.
Comments: Accepted at EMNLP 2023 (camera-ready version)
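A minimal sketch of the kind of in-context prompt such a caption-based pipeline could assemble. The template and all example strings below are invented for illustration; the authors' exact prompt format may differ.

```python
def build_kbvqa_prompt(captions, examples, question):
    """Assemble an in-context-learning prompt from question-informative
    captions and a few solved demonstrations (illustrative format only)."""
    parts = []
    for ex in examples:  # few-shot demonstrations
        parts.append(f"Context: {ex['caption']}\nQuestion: {ex['question']}\nAnswer: {ex['answer']}")
    parts.append("Context: " + " ".join(captions) + f"\nQuestion: {question}\nAnswer:")
    return "\n\n".join(parts)

prompt = build_kbvqa_prompt(
    captions=["A man holds an umbrella on a rainy street."],
    examples=[{"caption": "A bowl of sliced oranges.",
               "question": "What vitamin are these rich in?",
               "answer": "vitamin c"}],
    question="Why is the man holding that object?",
)
print(prompt)  # the assembled prompt is then fed to a language model such as LLaMA
```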
arXiv:2308.13392 (https://arxiv.org/abs/2308.13392) [pdf, other] cs.CV
Title: Self-Supervised Representation Learning with Cross-Context Learning between Global and Hypercolumn Features
Authors: Zheng Gao, Chen Feng, Ioannis Patras
Abstract: Whilst contrastive learning yields powerful representations by matching different augmented views of the same instance, it lacks the ability to capture the similarities between different instances. One popular way to address this limitation is by learning global features (after the global pooling) to capture inter-instance relationships based on knowledge distillation, where the global features of the teacher are used to guide the learning of the global features of the student. Inspired by cross-modality learning, we extend this existing framework, which only learns from global features, by encouraging the global features and intermediate layer features to learn from each other. This leads to our novel self-supervised framework, cross-context learning between global and hypercolumn features (CGH), which enforces the consistency of instance relations between low- and high-level semantics. Specifically, we stack the intermediate feature maps to construct a hypercolumn representation so that we can measure instance relations using two contexts (hypercolumn and global feature) separately, and then use the relations of one context to guide the learning of the other. This cross-context learning allows the model to learn from the differences between the two contexts. The experimental results on linear classification and downstream tasks show that our method outperforms the state-of-the-art methods.
Submitted 1 September, 2023; v1 submitted 25 August, 2023; originally announced August 2023.
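The core idea, measuring instance relations in two contexts and letting each guide the other, can be sketched generically as follows. This is an illustration of relation consistency between two feature contexts under simple assumptions, not the exact CGH objective; the feature tensors are placeholders.

```python
import torch
import torch.nn.functional as F

def relation_matrix(features, temperature=0.1):
    """Pairwise instance relations within a batch: row-wise softmax over
    cosine similarities, with self-similarity excluded."""
    z = F.normalize(features, dim=-1)
    sim = z @ z.t() / temperature
    sim.fill_diagonal_(float("-inf"))
    return sim.softmax(dim=-1)

def cross_context_loss(global_feats, hypercolumn_feats):
    """Encourage the instance relations measured in one context to match
    those measured in the other (a sketch of the general idea)."""
    r_global = relation_matrix(global_feats)
    r_hyper = relation_matrix(hypercolumn_feats)
    kl = lambda p, q: (p * (p.clamp_min(1e-8).log() - q.clamp_min(1e-8).log())).sum(-1).mean()
    return 0.5 * (kl(r_global.detach(), r_hyper) + kl(r_hyper.detach(), r_global))

# hypothetical batch of 8 samples in two contexts of different dimensionality
loss = cross_context_loss(torch.randn(8, 128), torch.randn(8, 256))
print(loss.item())
```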
arXiv:2308.13382 (https://arxiv.org/abs/2308.13382) [pdf, other] cs.CV
Title: Prompting Visual-Language Models for Dynamic Facial Expression Recognition
Authors: Zengqun Zhao, Ioannis Patras
Abstract: This paper presents a novel visual-language model called DFER-CLIP, which is based on the CLIP model and designed for in-the-wild Dynamic Facial Expression Recognition (DFER). Specifically, the proposed DFER-CLIP consists of a visual part and a textual part. For the visual part, based on the CLIP image encoder, a temporal model consisting of several Transformer encoders is introduced for extracting temporal facial expression features, and the final feature embedding is obtained as a learnable "class" token. For the textual part, we use as inputs textual descriptions of the facial behaviour related to the classes (facial expressions) that we are interested in recognising; those descriptions are generated using large language models, like ChatGPT. This, in contrast to works that use only the class names, more accurately captures the relationship between them. Alongside the textual description, we introduce a learnable token which helps the model learn relevant context information for each expression during training. Extensive experiments demonstrate the effectiveness of the proposed method and show that our DFER-CLIP achieves state-of-the-art results compared with current supervised DFER methods on the DFEW, FERV39k, and MAFW benchmarks. Code is publicly available at https://github.com/zengqunzhao/DFER-CLIP.
Submitted 26 November, 2024; v1 submitted 25 August, 2023; originally announced August 2023.
Comments: Accepted at BMVC 2023 (Camera-Ready Version)
arXiv:2307.15697 (https://arxiv.org/abs/2307.15697) [pdf, other] cs.CV
Title: Aligned Unsupervised Pretraining of Object Detectors with Self-training
Authors: Ioannis Maniadis Metaxas, Adrian Bulat, Ioannis Patras, Brais Martinez, Georgios Tzimiropoulos
Abstract: The unsupervised pretraining of object detectors has recently become a key component of object detector training, as it leads to improved performance and faster convergence during the supervised fine-tuning stage. Existing unsupervised pretraining methods, however, typically rely on low-level information to define proposals that are used to train the detector. Furthermore, in the absence of class labels for these proposals, an auxiliary loss is used to add high-level semantics. This results in complex pipelines and a task gap between the pretraining and the downstream task. We propose a framework that mitigates this issue and consists of three simple yet key ingredients: (i) richer initial proposals that do encode high-level semantics, (ii) class pseudo-labeling through clustering, which enables pretraining using a standard object detection training pipeline, and (iii) self-training to iteratively improve and enrich the object proposals. Once the pretraining and downstream tasks are aligned, a simple detection pipeline without further bells and whistles can be directly used for pretraining and, in fact, results in state-of-the-art performance in both the full and low-data regimes, across detector architectures and datasets, by significant margins. We further show that our pretraining strategy is also capable of pretraining from scratch (including the backbone) and works on complex images like COCO, paving the way for unsupervised representation learning using object detection directly as a pretext task.
Submitted 7 July, 2024; v1 submitted 28 July, 2023; originally announced July 2023.
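Ingredient (ii), class pseudo-labeling through clustering, can be illustrated in a generic way: cluster the proposal features and treat cluster indices as class labels for a standard detection loss. The sketch below shows only that idea, with made-up feature arrays and an arbitrary cluster count; it is not the authors' recipe.

```python
import numpy as np
from sklearn.cluster import KMeans

def pseudo_label_proposals(proposal_features, num_classes=128, seed=0):
    """Assign pseudo class labels to region proposals by clustering their
    features, so that a standard detection training pipeline can be used
    during pretraining (generic sketch)."""
    kmeans = KMeans(n_clusters=num_classes, n_init=10, random_state=seed)
    return kmeans.fit_predict(proposal_features)

# hypothetical proposal embeddings: 1000 proposals with 256-d features
labels = pseudo_label_proposals(np.random.randn(1000, 256), num_classes=16)
print(np.bincount(labels))  # proposals per pseudo-class
```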
arXiv:2307.10797 (https://arxiv.org/abs/2307.10797) [pdf, other] cs.CV
Title: HyperReenact: One-Shot Reenactment via Jointly Learning to Refine and Retarget Faces
Authors: Stella Bounareli, Christos Tzelepis, Vasileios Argyriou, Ioannis Patras, Georgios Tzimiropoulos
Abstract: In this paper, we present our method for neural face reenactment, called HyperReenact, which aims to generate realistic talking head images of a source identity, driven by a target facial pose. Existing state-of-the-art face reenactment methods train controllable generative models that learn to synthesize realistic facial images, yet they produce reenacted faces that are prone to significant visual artifacts, especially under the challenging condition of extreme head pose changes, or they require expensive few-shot fine-tuning to better preserve the source identity characteristics. We propose to address these limitations by leveraging the photorealistic generation ability and the disentangled properties of a pretrained StyleGAN2 generator: we first invert the real images into its latent space and then use a hypernetwork to perform (i) refinement of the source identity characteristics and (ii) facial pose re-targeting, eliminating in this way the dependence on external editing methods that typically produce artifacts. Our method operates under the one-shot setting (i.e., using a single source frame) and allows for cross-subject reenactment, without requiring any subject-specific fine-tuning. We compare our method both quantitatively and qualitatively against several state-of-the-art techniques on the standard benchmarks of VoxCeleb1 and VoxCeleb2, demonstrating the superiority of our approach in producing artifact-free images that exhibit remarkable robustness even under extreme head pose changes. We make the code and the pretrained models publicly available at: https://github.com/StelaBou/HyperReenact
Submitted 20 July, 2023; originally announced July 2023.
Comments: Accepted for publication in ICCV 2023. Project page: https://stelabou.github.io/hyperreenact.github.io/ Code: https://github.com/StelaBou/HyperReenact
arXiv:2305.14053 (https://arxiv.org/abs/2305.14053) [pdf, other] cs.CV cs.LG
Title: Parts of Speech-Grounded Subspaces in Vision-Language Models
Authors: James Oldfield, Christos Tzelepis, Yannis Panagakis, Mihalis A. Nicolaou, Ioannis Patras
Abstract: Latent image representations arising from vision-language models have proved immensely useful for a variety of downstream tasks. However, their utility is limited by their entanglement with respect to different visual attributes. For instance, recent work has shown that CLIP image representations are often biased toward specific visual properties (such as objects or actions) in an unpredictable manner. In this paper, we propose to separate representations of the different visual modalities in CLIP's joint vision-language space by leveraging the association between parts of speech and specific visual modes of variation (e.g. nouns relate to objects, adjectives describe appearance). This is achieved by formulating an appropriate component analysis model that learns subspaces capturing variability corresponding to a specific part of speech, while jointly minimising variability to the rest. Such a subspace yields disentangled representations of the different visual properties of an image or text in closed form, while respecting the underlying geometry of the manifold on which the representations lie. What's more, we show the proposed model additionally facilitates learning subspaces corresponding to specific visual appearances (e.g. artists' painting styles), which enables the selective removal of entire visual themes from CLIP-based text-to-image synthesis. We validate the model both qualitatively, by visualising the subspace projections with a text-to-image model and by preventing the imitation of artists' styles, and quantitatively, through class invariance metrics and improvements to baseline zero-shot classification.
Submitted 12 November, 2023; v1 submitted 23 May, 2023; originally announced May 2023.
Comments: Accepted at NeurIPS 2023
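One generic way to instantiate a component-analysis objective of this kind (maximise the variability associated with one part of speech while minimising variability for the rest) is a generalised eigenvalue problem between two covariance matrices. The sketch below illustrates that family of models under this assumption; it is not the paper's formulation, and the inputs are random placeholders.

```python
import numpy as np
from scipy.linalg import eigh

def attribute_subspace(target_diffs, other_diffs, k=8, eps=1e-4):
    """Find a k-dim subspace whose directions carry much of the variance of
    `target_diffs` (embedding differences tied to one attribute group) while
    carrying little of the variance of `other_diffs`.  Solved as the
    generalised eigenproblem  C_t v = lambda (C_o + eps I) v."""
    c_t = np.cov(target_diffs, rowvar=False)
    c_o = np.cov(other_diffs, rowvar=False) + eps * np.eye(target_diffs.shape[1])
    evals, evecs = eigh(c_t, c_o)        # eigenvalues in ascending order
    return evecs[:, -k:]                 # top-k directions span the subspace

# hypothetical embedding-difference data in a 64-d space
basis = attribute_subspace(np.random.randn(500, 64), np.random.randn(500, 64), k=4)
print(basis.shape)  # (64, 4)
```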
arXiv:2304.03378 (https://arxiv.org/abs/2304.03378) [pdf, other] cs.CV cs.LG
Title: Self-Supervised Video Similarity Learning
Authors: Giorgos Kordopatis-Zilos, Giorgos Tolias, Christos Tzelepis, Ioannis Kompatsiaris, Ioannis Patras, Symeon Papadopoulos
Abstract: We introduce S$^2$VS, a video similarity learning approach with self-supervision. Self-Supervised Learning (SSL) is typically used to train deep models on a proxy task so as to have strong transferability on target tasks after fine-tuning. Here, in contrast to prior work, SSL is used to perform video similarity learning and to address multiple retrieval and detection tasks at once with no use of labeled data. This is achieved by learning via instance discrimination with task-tailored augmentations and the widely used InfoNCE loss, together with an additional loss operating jointly on self-similarity and hard-negative similarity. We benchmark our method on tasks where video relevance is defined with varying granularity, ranging from video copies to videos depicting the same incident or event. We learn a single universal model that achieves state-of-the-art performance on all tasks, surpassing previously proposed methods that use labeled data. The code and pretrained models are publicly available at: https://github.com/gkordo/s2vs
Submitted 16 June, 2023; v1 submitted 6 April, 2023; originally announced April 2023.
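For reference, the InfoNCE loss named in this abstract is the standard instance-discrimination objective shown below. This is a textbook sketch, not the S$^2$VS code; the tensors are placeholders for real video embeddings.

```python
import torch
import torch.nn.functional as F

def info_nce(anchors, positives, temperature=0.07):
    """Standard InfoNCE over a batch: each anchor's positive is the matching
    row of `positives`, and every other row acts as a negative."""
    a = F.normalize(anchors, dim=-1)
    p = F.normalize(positives, dim=-1)
    logits = a @ p.t() / temperature            # (N, N) similarity matrix
    targets = torch.arange(a.size(0), device=a.device)
    return F.cross_entropy(logits, targets)

# hypothetical embeddings of two augmented views of 16 videos
loss = info_nce(torch.randn(16, 256), torch.randn(16, 256))
print(loss.item())
```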
arXiv:2304.01042 (https://arxiv.org/abs/2304.01042) [pdf, other] cs.CV
Title: DivClust: Controlling Diversity in Deep Clustering
Authors: Ioannis Maniadis Metaxas, Georgios Tzimiropoulos, Ioannis Patras
Abstract: Clustering has been a major research topic in the field of machine learning, one to which Deep Learning has recently been applied with significant success. However, an aspect of clustering that is not addressed by existing deep clustering methods is that of efficiently producing multiple, diverse partitionings for a given dataset. This is particularly important, as a diverse set of base clusterings is necessary for consensus clustering, which has been found to produce better and more robust results than relying on a single clustering. To address this gap, we propose DivClust, a diversity-controlling loss that can be incorporated into existing deep clustering frameworks to produce multiple clusterings with the desired degree of diversity. We conduct experiments with multiple datasets and deep clustering frameworks and show that: a) our method effectively controls diversity across frameworks and datasets with very small additional computational cost, b) the sets of clusterings learned by DivClust include solutions that significantly outperform single-clustering baselines, and c) using an off-the-shelf consensus clustering algorithm, DivClust produces consensus clustering solutions that consistently outperform single-clustering baselines, effectively improving the performance of the base deep clustering framework.
Submitted 3 April, 2023; originally announced April 2023.
Comments: Accepted for publication in CVPR 2023
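Diversity between two clusterings is commonly quantified with normalized mutual information (NMI), and one could penalise any excess over a target similarity. The sketch below only illustrates that notion of controlled diversity with random toy assignments; DivClust defines its own loss for this purpose.

```python
import numpy as np
from sklearn.metrics import normalized_mutual_info_score

def diversity_gap(assignments_a, assignments_b, target_similarity=0.5):
    """How far a pair of clusterings is from a desired degree of diversity.
    NMI = 1 means identical partitions, 0 means independent; the excess over
    `target_similarity` is what a diversity control would act on."""
    nmi = normalized_mutual_info_score(assignments_a, assignments_b)
    return max(0.0, nmi - target_similarity)

# hypothetical hard assignments of 1000 samples into 10 clusters
a = np.random.randint(0, 10, size=1000)
b = np.random.randint(0, 10, size=1000)
print(diversity_gap(a, b))
```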
arXiv:2303.12756 (https://arxiv.org/abs/2303.12756) [pdf, other] cs.CV
Title: MaskCon: Masked Contrastive Learning for Coarse-Labelled Dataset
Authors: Chen Feng, Ioannis Patras
Abstract: Deep learning has achieved great success in recent years with the aid of advanced neural network structures and large-scale human-annotated datasets. However, it is often costly and difficult to accurately and efficiently annotate large-scale datasets, especially for some specialized domains where fine-grained labels are required. In this setting, coarse labels are much easier to acquire as they do not require expert knowledge. In this work, we propose a contrastive learning method, called $\textbf{Mask}$ed $\textbf{Con}$trastive learning ($\textbf{MaskCon}$), to address the under-explored problem setting where we learn with a coarse-labelled dataset in order to address a finer labelling problem. More specifically, within the contrastive learning framework, for each sample our method generates soft-labels with the aid of coarse labels against other samples and another augmented view of the sample in question. In contrast to self-supervised contrastive learning, where only the sample's augmentations are considered hard positives, and supervised contrastive learning, where only samples with the same coarse labels are considered hard positives, we propose soft labels based on sample distances that are masked by the coarse labels. This allows us to utilize both inter-sample relations and coarse labels. We demonstrate that our method can obtain as special cases many existing state-of-the-art works and that it provides tighter bounds on the generalization error. Experimentally, our method achieves significant improvement over the current state-of-the-art on various datasets, including the CIFAR10, CIFAR100, ImageNet-1K, Stanford Online Products and Stanford Cars196 datasets. Code and annotations are available at https://github.com/MrChenFeng/MaskCon_CVPR2023.
Submitted 22 March, 2023; originally announced March 2023.
Comments: CVPR 2023 camera-ready version. Codes are available at https://github.com/MrChenFeng/MaskCon_CVPR2023
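The key ingredient described above, similarity-based soft labels that are masked by the coarse labels, can be sketched as follows. This is a simplified illustration of the idea, not the released MaskCon code; the query/key tensors and coarse labels are invented.

```python
import torch
import torch.nn.functional as F

def masked_soft_labels(query, keys, coarse_q, coarse_k, temperature=0.1):
    """Soft positive weights for one query: similarity-based weights kept
    only for keys that share the query's coarse label, zeroed elsewhere."""
    q = F.normalize(query, dim=-1)
    k = F.normalize(keys, dim=-1)
    sims = (k @ q) / temperature                 # (N,) similarities to the query
    mask = (coarse_k == coarse_q).float()        # same coarse class only
    weights = sims.softmax(dim=0) * mask
    return weights / weights.sum().clamp_min(1e-8)

w = masked_soft_labels(torch.randn(128), torch.randn(32, 128),
                       coarse_q=torch.tensor(3),
                       coarse_k=torch.randint(0, 5, (32,)))
print(w.sum())  # ~1.0 when at least one key shares the coarse label
```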
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.11296v1-abstract-full').style.display = 'none'; document.getElementById('2303.11296v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication in CVPR 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.00180">arXiv:2303.00180</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2303.00180">pdf</a>, <a href="https://arxiv.org/format/2303.00180">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MMA-MRNNet: Harnessing Multiple Models of Affect and Dynamic Masked RNN for Precise Facial Expression Intensity Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kollias%2C+D">Dimitrios Kollias</a>, <a href="/search/cs?searchtype=author&amp;query=Psaroudakis%2C+A">Andreas Psaroudakis</a>, <a href="/search/cs?searchtype=author&amp;query=Arsenos%2C+A">Anastasios Arsenos</a>, <a href="/search/cs?searchtype=author&amp;query=Theofilou%2C+P">Paraskevi Theofilou</a>, <a href="/search/cs?searchtype=author&amp;query=Shao%2C+C">Chunchang Shao</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+G">Guanyu Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.00180v4-abstract-short" style="display: inline;"> This paper presents MMA-MRNNet, a novel deep learning architecture for dynamic multi-output Facial Expression Intensity Estimation (FEIE) from video data. Traditional approaches to this task often rely on complex 3-D CNNs, which require extensive pre-training and assume that facial expressions are uniformly distributed across all frames of a video. These methods struggle to handle videos of varyin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.00180v4-abstract-full').style.display = 'inline'; document.getElementById('2303.00180v4-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.00180v4-abstract-full" style="display: none;"> This paper presents MMA-MRNNet, a novel deep learning architecture for dynamic multi-output Facial Expression Intensity Estimation (FEIE) from video data. Traditional approaches to this task often rely on complex 3-D CNNs, which require extensive pre-training and assume that facial expressions are uniformly distributed across all frames of a video. These methods struggle to handle videos of varying lengths, often resorting to ad-hoc strategies that either discard valuable information or introduce bias. 
arXiv:2303.00180 (https://arxiv.org/abs/2303.00180) [pdf, other] cs.CV cs.LG
Title: MMA-MRNNet: Harnessing Multiple Models of Affect and Dynamic Masked RNN for Precise Facial Expression Intensity Estimation
Authors: Dimitrios Kollias, Andreas Psaroudakis, Anastasios Arsenos, Paraskevi Theofilou, Chunchang Shao, Guanyu Hu, Ioannis Patras
Abstract: This paper presents MMA-MRNNet, a novel deep learning architecture for dynamic multi-output Facial Expression Intensity Estimation (FEIE) from video data. Traditional approaches to this task often rely on complex 3-D CNNs, which require extensive pre-training and assume that facial expressions are uniformly distributed across all frames of a video. These methods struggle to handle videos of varying lengths, often resorting to ad-hoc strategies that either discard valuable information or introduce bias. MMA-MRNNet addresses these challenges through a two-stage process. First, the Multiple Models of Affect (MMA) extractor component is a Multi-Task Learning CNN that concurrently estimates valence-arousal, recognizes basic facial expressions, and detects action units in each frame. These representations are then processed by a Masked RNN component, which captures temporal dependencies and dynamically updates weights according to the true length of the input video, ensuring that only the most relevant features are used for the final prediction. The proposed unimodal, non-ensemble MMA-MRNNet was evaluated on the Hume-Reaction dataset and demonstrated significantly superior performance, surpassing state-of-the-art methods by a wide margin, regardless of whether they were unimodal, multimodal, or ensemble approaches. Finally, we demonstrate the effectiveness of the MMA component of our proposed method across multiple in-the-wild datasets, where it consistently outperforms all state-of-the-art methods across various metrics.
Submitted 4 September, 2024; v1 submitted 28 February, 2023; originally announced March 2023.
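One standard way to make a recurrent head respect the true (unpadded) length of each video, in the spirit of the masked RNN described above but not the authors' exact module, is to pack the padded frame sequences before the RNN. The feature dimensions below are arbitrary placeholders.

```python
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

class LengthAwareGRU(nn.Module):
    """GRU head that ignores padded frames: a common way to realise a
    length-aware ('masked') recurrent component over variable-length videos."""
    def __init__(self, feat_dim=17, hidden=64, num_outputs=7):
        super().__init__()
        self.rnn = nn.GRU(feat_dim, hidden, batch_first=True)
        self.head = nn.Linear(hidden, num_outputs)

    def forward(self, per_frame_feats, lengths):
        packed = pack_padded_sequence(per_frame_feats, lengths.cpu(),
                                      batch_first=True, enforce_sorted=False)
        _, h_n = self.rnn(packed)       # h_n holds the last *valid* step per video
        return self.head(h_n[-1])

# hypothetical per-frame affect features for 4 padded videos of true lengths 30/12/25/7
model = LengthAwareGRU()
out = model(torch.randn(4, 30, 17), lengths=torch.tensor([30, 12, 25, 7]))
print(out.shape)  # (4, 7)
```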
arXiv:2211.11460 (https://arxiv.org/abs/2211.11460) [pdf, other] eess.SP cs.AI
Title: Motor Imagery Decoding Using Ensemble Curriculum Learning and Collaborative Training
Authors: Georgios Zoumpourlis, Ioannis Patras
Abstract: In this work, we study the problem of cross-subject motor imagery (MI) decoding from electroencephalography (EEG) data. Multi-subject EEG datasets present several kinds of domain shifts due to various inter-individual differences (e.g. brain anatomy, personality and cognitive profile). These domain shifts render multi-subject training a challenging task and also impede robust cross-subject generalization. Inspired by the importance of domain generalization techniques for tackling such issues, we propose a two-stage model ensemble architecture built with multiple feature extractors (first stage) and a shared classifier (second stage), which we train end-to-end with two novel loss terms. The first loss applies curriculum learning, forcing each feature extractor to specialize to a subset of the training subjects and promoting feature diversity. The second loss is an intra-ensemble distillation objective that allows collaborative exchange of knowledge between the models of the ensemble. We compare our method against several state-of-the-art techniques, conducting subject-independent experiments on two large MI datasets, namely PhysioNet and OpenBMI. Our algorithm outperforms all of the methods in both 5-fold cross-validation and leave-one-subject-out evaluation settings, using a substantially lower number of trainable parameters. We demonstrate that our model ensembling approach, combining the powers of curriculum learning and collaborative training, leads to high learning capacity and robust performance. Our work addresses the issue of domain shifts in multi-subject EEG datasets, paving the way for calibration-free brain-computer interfaces. We make our code publicly available at: https://github.com/gzoumpourlis/Ensemble-MI
Submitted 21 February, 2024; v1 submitted 21 November, 2022; originally announced November 2022.
Comments: Accepted for publication in the 12th IEEE International Winter Conference on Brain-Computer Interface (BCI), 2024. Code: https://github.com/gzoumpourlis/Ensemble-MI
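A common way to realise collaborative, intra-ensemble distillation is to pull each member's prediction toward the (detached) ensemble average, as sketched below. This is only one plausible instantiation of such an objective; the paper defines its own loss, and the logits here are random placeholders.

```python
import torch
import torch.nn.functional as F

def intra_ensemble_distillation(member_logits, temperature=2.0):
    """KL-divergence of each ensemble member toward the averaged ensemble
    prediction: a generic form of mutual/collaborative distillation."""
    probs = [F.softmax(logits / temperature, dim=-1) for logits in member_logits]
    ensemble = torch.stack(probs).mean(dim=0).detach()
    loss = 0.0
    for logits in member_logits:
        log_p = F.log_softmax(logits / temperature, dim=-1)
        loss = loss + F.kl_div(log_p, ensemble, reduction="batchmean")
    return loss / len(member_logits)

# hypothetical logits from 3 feature extractors over a batch of 8 EEG trials, 4 classes
loss = intra_ensemble_distillation([torch.randn(8, 4) for _ in range(3)])
print(loss.item())
```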
arXiv:2209.11276 [pdf, other] cs.CV, cs.AI
Capsule Network based Contrastive Learning of Unsupervised Visual Representations
Authors: Harsh Panwar, Ioannis Patras
Abstract: Capsule Networks have shown tremendous advancement in the past decade, outperforming traditional CNNs in various tasks due to their equivariant properties. With the use of vector I/O, which provides information about both the magnitude and direction of an object or its parts, there lies an enormous possibility of using Capsule Networks in unsupervised learning environments for visual representation tasks such as multi-class image classification. In this paper, we propose the Contrastive Capsule (CoCa) model, a Siamese-style Capsule Network using a contrastive loss with our novel architecture, training and testing algorithm. We evaluate the model on unsupervised image classification on the CIFAR-10 dataset and achieve a top-1 test accuracy of 70.50% and a top-5 test accuracy of 98.10%. Due to our efficient architecture, our model has 31 times fewer parameters and 71 times fewer FLOPs than the current SOTA in both supervised and unsupervised learning.
Submitted 22 September, 2022; originally announced September 2022.

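Illustration (not from the paper): a generic sketch of the two ingredients named in the abstract above, namely capsule-style vector outputs (via the standard squash non-linearity) and a contrastive loss between two augmented views. The CoCa architecture, pooling and exact loss variant are not reproduced here.

import torch
import torch.nn.functional as F

def squash(s, dim=-1, eps=1e-8):
    # Capsule squashing: keeps the vector direction, maps its length into [0, 1).
    sq_norm = (s ** 2).sum(dim=dim, keepdim=True)
    return (sq_norm / (1.0 + sq_norm)) * s / torch.sqrt(sq_norm + eps)

def nt_xent(z1, z2, tau=0.5):
    # Standard NT-Xent contrastive loss between two batches of embeddings (two views).
    z = F.normalize(torch.cat([z1, z2]), dim=1)           # (2N, D)
    sim = z @ z.t() / tau
    sim.fill_diagonal_(float("-inf"))                     # mask self-similarity
    n = z1.size(0)
    targets = torch.cat([torch.arange(n, 2 * n), torch.arange(0, n)])
    return F.cross_entropy(sim, targets)

caps_view1 = squash(torch.randn(32, 16, 8))               # (batch, n_capsules, capsule_dim)
caps_view2 = squash(torch.randn(32, 16, 8))
loss = nt_xent(caps_view1.flatten(1), caps_view2.flatten(1))
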
arXiv:2207.11163 [pdf, other] cs.CV; DOI: 10.1109/ICPR56361.2022.9956660
Adaptive Soft Contrastive Learning
Authors: Chen Feng, Ioannis Patras
Abstract: Self-supervised learning has recently achieved great success in representation learning without human annotations. The dominant method, contrastive learning, is generally based on instance discrimination tasks, i.e., individual samples are treated as independent categories. However, presuming all the samples are different contradicts the natural grouping of similar samples in common visual datasets, e.g., multiple views of the same dog. To bridge the gap, this paper proposes an adaptive method that introduces soft inter-sample relations, namely Adaptive Soft Contrastive Learning (ASCL). More specifically, ASCL transforms the original instance discrimination task into a multi-instance soft discrimination task, and adaptively introduces inter-sample relations. As an effective and concise plug-in module for existing self-supervised learning frameworks, ASCL achieves the best results on several benchmarks in terms of both performance and efficiency. Code is available at https://github.com/MrChenFeng/ASCL_ICPR2022.
Submitted 22 July, 2022; originally announced July 2022.
Comments: Accepted to ICPR2022

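Illustration (not from the paper): a minimal sketch of softened instance discrimination as described above; instead of treating every other sample as a pure negative, the one-hot instance target is relaxed towards the most similar samples. The weighting below (a temperature-scaled softmax over similarities mixed with the one-hot target at a fixed ratio) is a simplification of ASCL's adaptive scheme.

import torch
import torch.nn.functional as F

def soft_contrastive_targets(anchor, bank, self_idx, t_sim=0.1, mix=0.5):
    # Relax one-hot instance targets towards similar samples in a feature bank.
    sims = F.normalize(anchor, dim=1) @ F.normalize(bank, dim=1).t()   # (N, M)
    soft = F.softmax(sims / t_sim, dim=1)
    hard = F.one_hot(self_idx, bank.size(0)).float()
    return mix * hard + (1.0 - mix) * soft                # rows still sum to 1

def soft_info_nce(logits, targets):
    # Cross-entropy against soft (distribution) targets.
    return -(targets * F.log_softmax(logits, dim=1)).sum(1).mean()

q, bank = torch.randn(8, 128), torch.randn(32, 128)       # queries and a small feature bank
targets = soft_contrastive_targets(q, bank, self_idx=torch.arange(8))
loss = soft_info_nce(q @ bank.t() / 0.2, targets)
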
arXiv:2207.05577 [pdf, other] cs.CV, cs.HC, cs.MM; DOI: 10.1145/3503161.3548373
Learning from Label Relationships in Human Affect
Authors: Niki Maria Foteinopoulou, Ioannis Patras
Abstract: Automated estimation of human affect and mental state faces a number of difficulties, including learning from labels with poor or no temporal resolution, learning from few datasets with little data (often due to confidentiality constraints), and (very) long, in-the-wild videos. For these reasons, deep learning methodologies tend to overfit, that is, arrive at latent representations with poor generalisation performance on the final regression task. To overcome this, in this work we introduce two complementary contributions. First, we introduce a novel relational loss for multilabel regression and ordinal problems that regularises learning and leads to better generalisation. The proposed loss uses label vector inter-relational information to learn better latent representations by aligning batch label distances to the distances in the latent feature space. Second, we utilise a two-stage attention architecture that estimates a target for each clip by using features from the neighbouring clips as temporal context. We evaluate the proposed methodology on both continuous affect and schizophrenia severity estimation problems, as there are methodological and contextual parallels between the two. Experimental results demonstrate that the proposed methodology outperforms all baselines. In the domain of schizophrenia, the proposed methodology outperforms the previous state of the art by a large margin, achieving a PCC of up to 78%, performance close to that of human experts (85%) and much higher than previous works (uplift of up to 40%). In the case of affect recognition, we outperform previous vision-based methods in terms of CCC on both the OMG and the AMIGOS datasets. Specifically for AMIGOS, we outperform the previous SoTA CCC for both arousal and valence by 9% and 13% respectively, and on the OMG dataset we outperform previous vision works by up to 5% for both arousal and valence.
Submitted 15 August, 2022; v1 submitted 12 July, 2022; originally announced July 2022.
Comments: Accepted at ACM Multimedia (ACMMM) 2022, 10 pages, 4 figures

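Illustration (not from the paper): a small sketch of the relational regulariser described above; pairwise distances between label vectors in a batch act as targets for the corresponding pairwise distances between latent features. The exact distance functions, normalisation and weighting in the published loss may differ.

import torch
import torch.nn.functional as F

def relational_loss(features, labels):
    # Align pairwise feature distances with pairwise label distances (both rescaled to [0, 1]).
    d_feat = torch.cdist(features, features, p=2)
    d_lab = torch.cdist(labels, labels, p=2)
    d_feat = d_feat / (d_feat.max() + 1e-8)
    d_lab = d_lab / (d_lab.max() + 1e-8)
    return F.mse_loss(d_feat, d_lab)

feats, labs = torch.randn(16, 256), torch.rand(16, 2)     # 16 clips, 2-d labels (e.g. valence/arousal)
reg = relational_loss(feats, labs)                        # added to the main regression loss
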
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ACM Multimedia (ACMMM) 2022, 10 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2206.02104">arXiv:2206.02104</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2206.02104">pdf</a>, <a href="https://arxiv.org/format/2206.02104">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ContraCLIP: Interpretable GAN generation driven by pairs of contrasting sentences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tzelepis%2C+C">Christos Tzelepis</a>, <a href="/search/cs?searchtype=author&amp;query=Oldfield%2C+J">James Oldfield</a>, <a href="/search/cs?searchtype=author&amp;query=Tzimiropoulos%2C+G">Georgios Tzimiropoulos</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2206.02104v1-abstract-short" style="display: inline;"> This work addresses the problem of discovering non-linear interpretable paths in the latent space of pre-trained GANs in a model-agnostic manner. In the proposed method, the discovery is driven by a set of pairs of natural language sentences with contrasting semantics, named semantic dipoles, that serve as the limits of the interpretation that we require by the trainable latent paths to encode. By&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2206.02104v1-abstract-full').style.display = 'inline'; document.getElementById('2206.02104v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2206.02104v1-abstract-full" style="display: none;"> This work addresses the problem of discovering non-linear interpretable paths in the latent space of pre-trained GANs in a model-agnostic manner. In the proposed method, the discovery is driven by a set of pairs of natural language sentences with contrasting semantics, named semantic dipoles, that serve as the limits of the interpretation that we require by the trainable latent paths to encode. By using the pre-trained CLIP encoder, the sentences are projected into the vision-language space, where they serve as dipoles, and where RBF-based warping functions define a set of non-linear directional paths, one for each semantic dipole, allowing in this way traversals from one semantic pole to the other. 
arXiv:2206.00048 [pdf, other] cs.CV, cs.LG
PandA: Unsupervised Learning of Parts and Appearances in the Feature Maps of GANs
Authors: James Oldfield, Christos Tzelepis, Yannis Panagakis, Mihalis A. Nicolaou, Ioannis Patras
Abstract: Recent advances in the understanding of Generative Adversarial Networks (GANs) have led to remarkable progress in visual editing and synthesis tasks, capitalizing on the rich semantics that are embedded in the latent spaces of pre-trained GANs. However, existing methods are often tailored to specific GAN architectures and are limited to either discovering global semantic directions that do not facilitate localized control, or requiring some form of supervision through manually provided regions or segmentation masks. In this light, we present an architecture-agnostic approach that jointly discovers factors representing spatial parts and their appearances in an entirely unsupervised fashion. These factors are obtained by applying a semi-nonnegative tensor factorization on the feature maps, which in turn enables context-aware local image editing with pixel-level control. In addition, we show that the discovered appearance factors correspond to saliency maps that localize concepts of interest, without using any labels. Experiments on a wide range of GAN architectures and datasets show that, in comparison to the state of the art, our method is far more efficient in terms of training time and, most importantly, provides much more accurate localized control. Our code is available at: https://github.com/james-oldfield/PandA.
Submitted 6 February, 2023; v1 submitted 31 May, 2022; originally announced June 2022.
Comments: Accepted at ICLR 2023. Code available at: https://github.com/james-oldfield/PandA

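Illustration (not from the paper): a simplified sketch of factorising a (reshaped) feature map into a small number of non-negative spatial parts with per-part appearance vectors. For simplicity it applies plain NMF to ReLU-positive features with scikit-learn, rather than the semi-nonnegative tensor factorisation used by PandA; the layer shape and number of parts are made up.

import numpy as np
from sklearn.decomposition import NMF

C, H, W = 256, 16, 16                                   # pretend feature map of one GAN layer
feats = np.maximum(np.random.randn(C, H, W), 0.0)       # ReLU-like, hence non-negative

X = feats.reshape(C, H * W).T                           # (H*W, C): one row per spatial location
K = 6                                                   # number of parts to discover
nmf = NMF(n_components=K, init="nndsvda", max_iter=500)
parts = nmf.fit_transform(X)                            # (H*W, K): soft spatial part maps
appearances = nmf.components_                           # (K, C): per-part appearance vectors
part_maps = parts.T.reshape(K, H, W)                    # each map highlights one spatial part

scale = np.ones((K, 1)); scale[0] = 1.5                 # boost the appearance of part 0
X_edit = parts @ (appearances * scale)                  # locally edited feature map (flattened)
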
arXiv:2111.11736 [pdf, other] cs.CV
Tensor Component Analysis for Interpreting the Latent Space of GANs
Authors: James Oldfield, Markos Georgopoulos, Yannis Panagakis, Mihalis A. Nicolaou, Ioannis Patras
Abstract: This paper addresses the problem of finding interpretable directions in the latent space of pre-trained Generative Adversarial Networks (GANs) to facilitate controllable image synthesis. Such interpretable directions correspond to transformations that can affect both the style and geometry of the synthetic images. However, existing approaches that utilise linear techniques to find these transformations often fail to provide an intuitive way to separate these two sources of variation. To address this, we propose to a) perform a multilinear decomposition of the tensor of intermediate representations, and b) use a tensor-based regression to map directions found using this decomposition to the latent space. Our scheme allows for both linear edits corresponding to the individual modes of the tensor, and non-linear ones that model the multiplicative interactions between them. We show experimentally that we can utilise the former to better separate style- from geometry-based transformations, and the latter to generate an extended set of possible transformations in comparison to prior works. We demonstrate our approach's efficacy both quantitatively and qualitatively compared to the current state-of-the-art.
Submitted 23 November, 2021; originally announced November 2021.
Comments: BMVC 2021

<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.11736v1-abstract-full').style.display = 'none'; document.getElementById('2111.11736v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">BMVC 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2111.11288">arXiv:2111.11288</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2111.11288">pdf</a>, <a href="https://arxiv.org/format/2111.11288">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TCSVT.2024.3426994">10.1109/TCSVT.2024.3426994 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> SSR: An Efficient and Robust Framework for Learning with Unknown Label Noise </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Feng%2C+C">Chen Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Tzimiropoulos%2C+G">Georgios Tzimiropoulos</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2111.11288v2-abstract-short" style="display: inline;"> Despite the large progress in supervised learning with neural networks, there are significant challenges in obtaining high-quality, large-scale and accurately labelled datasets. In such a context, how to learn in the presence of noisy labels has received more and more attention. As a relatively complex problem, in order to achieve good results, current approaches often integrate components from se&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2111.11288v2-abstract-full').style.display = 'inline'; document.getElementById('2111.11288v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2111.11288v2-abstract-full" style="display: none;"> Despite the large progress in supervised learning with neural networks, there are significant challenges in obtaining high-quality, large-scale and accurately labelled datasets. In such a context, how to learn in the presence of noisy labels has received more and more attention. As a relatively complex problem, in order to achieve good results, current approaches often integrate components from several fields, such as supervised learning, semi-supervised learning, transfer learning and resulting in complicated methods. 
arXiv:2109.13357 [pdf, other] cs.CV
WarpedGANSpace: Finding non-linear RBF paths in GAN latent space
Authors: Christos Tzelepis, Georgios Tzimiropoulos, Ioannis Patras
Abstract: This work addresses the problem of discovering, in an unsupervised manner, interpretable paths in the latent space of pretrained GANs, so as to provide an intuitive and easy way of controlling the underlying generative factors. In doing so, it addresses some of the limitations of the state-of-the-art works, namely, a) that they discover directions that are independent of the latent code, i.e., paths that are linear, and b) that their evaluation relies either on visual inspection or on laborious human labeling. More specifically, we propose to learn non-linear warpings on the latent space, each one parametrized by a set of RBF-based latent space warping functions, and where each warping gives rise to a family of non-linear paths via the gradient of the function. Building on the work of Voynov and Babenko, which discovers linear paths, we optimize the trainable parameters of the set of RBFs so that images generated by codes along different paths are easily distinguishable by a discriminator network. This leads to easily distinguishable image transformations, such as pose and facial expressions in facial images. We show that linear paths can be derived as a special case of our method, and show experimentally that non-linear paths in the latent space lead to steeper, more disentangled and interpretable changes in the image space than state-of-the-art methods, both qualitatively and quantitatively. We make the code and the pretrained models publicly available at: https://github.com/chi0tzp/WarpedGANSpace.
Submitted 27 September, 2021; originally announced September 2021.
Comments: Accepted for publication in ICCV 2021

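Illustration (not from the paper): a small numpy sketch of the mechanism described above, where each interpretable path follows the gradient of an RBF-based warping of the latent space, so the step direction depends on the current latent code rather than being a fixed linear direction. The RBF centres, weights, bandwidth and step size below are random placeholders; in the paper they are trained so that a discriminator can tell the resulting paths apart.

import numpy as np

def rbf_warp_grad(z, centers, alphas, gamma):
    # Gradient of f(z) = sum_i alpha_i * exp(-gamma * ||z - c_i||^2).
    diff = z[None, :] - centers                           # (n_centers, dim)
    w = alphas * np.exp(-gamma * (diff ** 2).sum(axis=1))
    return (-2.0 * gamma * w[:, None] * diff).sum(axis=0)

def traverse(z0, centers, alphas, gamma=0.05, step=0.2, n_steps=10):
    # Follow the normalised gradient of the warping: a non-linear path in latent space.
    z, path = z0.copy(), [z0.copy()]
    for _ in range(n_steps):
        g = rbf_warp_grad(z, centers, alphas, gamma)
        z = z + step * g / (np.linalg.norm(g) + 1e-8)
        path.append(z.copy())
    return np.stack(path)                                 # (n_steps + 1, dim)

rng = np.random.default_rng(0)
z0 = rng.standard_normal(128)                             # latent code of a pretrained GAN (placeholder)
path = traverse(z0, rng.standard_normal((8, 128)), rng.standard_normal(8))
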
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication in ICCV 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.13266">arXiv:2106.13266</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.13266">pdf</a>, <a href="https://arxiv.org/format/2106.13266">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/s11263-022-01651-3">10.1007/s11263-022-01651-3 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> DnS: Distill-and-Select for Efficient and Accurate Video Indexing and Retrieval </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kordopatis-Zilos%2C+G">Giorgos Kordopatis-Zilos</a>, <a href="/search/cs?searchtype=author&amp;query=Tzelepis%2C+C">Christos Tzelepis</a>, <a href="/search/cs?searchtype=author&amp;query=Papadopoulos%2C+S">Symeon Papadopoulos</a>, <a href="/search/cs?searchtype=author&amp;query=Kompatsiaris%2C+I">Ioannis Kompatsiaris</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.13266v3-abstract-short" style="display: inline;"> In this paper, we address the problem of high performance and computationally efficient content-based video retrieval in large-scale datasets. Current methods typically propose either: (i) fine-grained approaches employing spatio-temporal representations and similarity calculations, achieving high performance at a high computational cost or (ii) coarse-grained approaches representing/indexing vide&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.13266v3-abstract-full').style.display = 'inline'; document.getElementById('2106.13266v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.13266v3-abstract-full" style="display: none;"> In this paper, we address the problem of high performance and computationally efficient content-based video retrieval in large-scale datasets. Current methods typically propose either: (i) fine-grained approaches employing spatio-temporal representations and similarity calculations, achieving high performance at a high computational cost or (ii) coarse-grained approaches representing/indexing videos as global vectors, where the spatio-temporal structure is lost, providing low performance but also having low computational cost. 
In this work, we propose a Knowledge Distillation framework, called Distill-and-Select (DnS), that starting from a well-performing fine-grained Teacher Network learns: a) Student Networks at different retrieval performance and computational efficiency trade-offs and b) a Selector Network that at test time rapidly directs samples to the appropriate student to maintain both high retrieval performance and high computational efficiency. We train several students with different architectures and arrive at different trade-offs of performance and efficiency, i.e., speed and storage requirements, including fine-grained students that store/index videos using binary representations. Importantly, the proposed scheme allows Knowledge Distillation in large, unlabelled datasets -- this leads to good students. We evaluate DnS on five public datasets on three different video retrieval tasks and demonstrate a) that our students achieve state-of-the-art performance in several cases and b) that the DnS framework provides an excellent trade-off between retrieval performance, computational speed, and storage space. In specific configurations, the proposed method achieves similar mAP with the teacher but is 20 times faster and requires 240 times less storage space. The collected dataset and implementation are publicly available: https://github.com/mever-team/distill-and-select. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.13266v3-abstract-full').style.display = 'none'; document.getElementById('2106.13266v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. 
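Illustration (not from the paper): a schematic sketch of the routing pattern described above; every database video is scored cheaply with a coarse student, and a selector score decides which candidates are re-scored by the expensive fine-grained student. The three similarity functions below are random stand-ins, meant only to show the control flow and the computational budget.

import numpy as np

def retrieve(query, db_ids, coarse_sim, selector_score, fine_sim, budget=0.2):
    # Coarse ranking everywhere; fine-grained re-scoring only where the selector asks for it.
    coarse = np.array([coarse_sim(query, v) for v in db_ids])
    uncertainty = np.array([selector_score(query, v) for v in db_ids])

    n_fine = max(1, int(budget * len(db_ids)))            # re-rank only a fraction of the database
    for i in np.argsort(-uncertainty)[:n_fine]:
        coarse[i] = fine_sim(query, db_ids[i])            # expensive student, used sparingly
    return np.argsort(-coarse)                            # ranked database indices

rng = np.random.default_rng(1)                            # toy stand-ins for the three networks
ranked = retrieve("q0", [f"v{i}" for i in range(100)],
                  coarse_sim=lambda q, v: rng.random(),
                  selector_score=lambda q, v: rng.random(),
                  fine_sim=lambda q, v: rng.random())
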
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> International Journal of Computer Vision (2022) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2106.04150">arXiv:2106.04150</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2106.04150">pdf</a>, <a href="https://arxiv.org/format/2106.04150">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3460426.3463643">10.1145/3460426.3463643 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Few-Shot Action Localization without Knowing Boundaries </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xie%2C+T">Ting-Ting Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Tzelepis%2C+C">Christos Tzelepis</a>, <a href="/search/cs?searchtype=author&amp;query=Fu%2C+F">Fan Fu</a>, <a href="/search/cs?searchtype=author&amp;query=Patras%2C+I">Ioannis Patras</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2106.04150v2-abstract-short" style="display: inline;"> Learning to localize actions in long, cluttered, and untrimmed videos is a hard task, that in the literature has typically been addressed assuming the availability of large amounts of annotated training samples for each class -- either in a fully-supervised setting, where action boundaries are known, or in a weakly-supervised setting, where only class labels are known for each video. In this paper&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.04150v2-abstract-full').style.display = 'inline'; document.getElementById('2106.04150v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2106.04150v2-abstract-full" style="display: none;"> Learning to localize actions in long, cluttered, and untrimmed videos is a hard task, that in the literature has typically been addressed assuming the availability of large amounts of annotated training samples for each class -- either in a fully-supervised setting, where action boundaries are known, or in a weakly-supervised setting, where only class labels are known for each video. In this paper, we go a step further and show that it is possible to learn to localize actions in untrimmed videos when a) only one/few trimmed examples of the target action are available at test time, and b) when a large collection of videos with only class label annotation (some trimmed and some weakly annotated untrimmed ones) are available for training; with no overlap between the classes used during training and testing. To do so, we propose a network that learns to estimate Temporal Similarity Matrices (TSMs) that model a fine-grained similarity pattern between pairs of videos (trimmed or untrimmed), and uses them to generate Temporal Class Activation Maps (TCAMs) for seen or unseen classes. 
The TCAMs serve as temporal attention mechanisms to extract video-level representations of untrimmed videos, and to temporally localize actions at test time. To the best of our knowledge, we are the first to propose a weakly-supervised, one/few-shot action localization network that can be trained in an end-to-end fashion. Experimental results on THUMOS14 and ActivityNet1.2 datasets, show that our method achieves performance comparable or better to state-of-the-art fully-supervised, few-shot learning methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2106.04150v2-abstract-full').style.display = 'none'; document.getElementById('2106.04150v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 June, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICMR21 Camera ready; link to code: https://github.com/June01/WFSAL-icmr21</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Patras%2C+I&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Patras%2C+I&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Patras%2C+I&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 
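Illustration (not from the paper): a small sketch of the similarity machinery described above; frame features of a trimmed query and of an untrimmed video give a Temporal Similarity Matrix, which is collapsed into a 1-D activation over the untrimmed video and thresholded into candidate segments. In the paper the TCAMs are produced by a trained network; here the aggregation is a plain max over the query axis and all features are random.

import numpy as np

def temporal_similarity_matrix(query_feats, video_feats):
    # Cosine similarity between every (query frame, video frame) pair.
    q = query_feats / np.linalg.norm(query_feats, axis=1, keepdims=True)
    v = video_feats / np.linalg.norm(video_feats, axis=1, keepdims=True)
    return q @ v.T                                        # (T_query, T_video)

def localize(tsm, thr=0.5):
    # Collapse the TSM into a per-frame activation (a crude TCAM) and extract contiguous segments.
    tcam = tsm.max(axis=0)
    active, segments, start = tcam > thr, [], None
    for t, on in enumerate(active):
        if on and start is None:
            start = t
        elif not on and start is not None:
            segments.append((start, t - 1)); start = None
    if start is not None:
        segments.append((start, len(active) - 1))
    return tcam, segments

q_feats, v_feats = np.random.randn(16, 512), np.random.randn(400, 512)
tcam, segments = localize(temporal_similarity_matrix(q_feats, v_feats))
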