CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 79 results for author: <span class="mathjax">Koot, A C</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Koot%2C+A+C">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Koot, A C"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Koot%2C+A+C&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Koot, A C"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Koot%2C+A+C&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Koot%2C+A+C&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Koot%2C+A+C&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.17132">arXiv:2503.17132</a> <span> [<a href="https://arxiv.org/pdf/2503.17132">pdf</a>, <a href="https://arxiv.org/format/2503.17132">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> </div> </div> <p class="title is-5 mathjax"> Temporal-Guided Spiking Neural Networks for Event-Based Human Action Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+S">Siyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+S">Shilin Lu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shizheng Wang</a>, <a href="/search/cs?searchtype=author&query=Er%2C+M+H">Meng Hwa Er</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zengwei Zheng</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.17132v2-abstract-short" style="display: inline;"> This paper explores the promising interplay between spiking neural networks (SNNs) and event-based cameras for privacy-preserving human action recognition (HAR). The unique feature of event cameras in capturing only the outlines of motion, combined with SNNs' proficiency in processing spatiotemporal data through spikes, establishes a highly synergistic compatibility for event-based HAR. Previous s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17132v2-abstract-full').style.display = 'inline'; document.getElementById('2503.17132v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.17132v2-abstract-full" style="display: none;"> This paper explores the promising interplay between spiking neural networks (SNNs) and event-based cameras for privacy-preserving human action recognition (HAR). The unique feature of event cameras in capturing only the outlines of motion, combined with SNNs' proficiency in processing spatiotemporal data through spikes, establishes a highly synergistic compatibility for event-based HAR. Previous studies, however, have been limited by SNNs' ability to process long-term temporal information, essential for precise HAR. In this paper, we introduce two novel frameworks to address this: temporal segment-based SNN (\textit{TS-SNN}) and 3D convolutional SNN (\textit{3D-SNN}). The \textit{TS-SNN} extracts long-term temporal information by dividing actions into shorter segments, while the \textit{3D-SNN} replaces 2D spatial elements with 3D components to facilitate the transmission of temporal information. To promote further research in event-based HAR, we create a dataset, \textit{FallingDetection-CeleX}, collected using the high-resolution CeleX-V event camera $(1280 \times 800)$, comprising 7 distinct actions. Extensive experimental results show that our proposed frameworks surpass state-of-the-art SNN methods on our newly collected dataset and three other neuromorphic datasets, showcasing their effectiveness in handling long-range temporal information for event-based HAR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.17132v2-abstract-full').style.display = 'none'; document.getElementById('2503.17132v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.01288">arXiv:2503.01288</a> <span> [<a href="https://arxiv.org/pdf/2503.01288">pdf</a>, <a href="https://arxiv.org/format/2503.01288">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Reconciling Stochastic and Deterministic Strategies for Zero-shot Image Restoration using Diffusion Model in Dual </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chong Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+L">Lanqing Guo</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Z">Zixuan Fu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Siyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+H">Hao Cheng</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+B">Bihan Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.01288v1-abstract-short" style="display: inline;"> Plug-and-play (PnP) methods offer an iterative strategy for solving image restoration (IR) problems in a zero-shot manner, using a learned \textit{discriminative denoiser} as the implicit prior. More recently, a sampling-based variant of this approach, which utilizes a pre-trained \textit{generative diffusion model}, has gained great popularity for solving IR problems through stochastic sampling.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01288v1-abstract-full').style.display = 'inline'; document.getElementById('2503.01288v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.01288v1-abstract-full" style="display: none;"> Plug-and-play (PnP) methods offer an iterative strategy for solving image restoration (IR) problems in a zero-shot manner, using a learned \textit{discriminative denoiser} as the implicit prior. More recently, a sampling-based variant of this approach, which utilizes a pre-trained \textit{generative diffusion model}, has gained great popularity for solving IR problems through stochastic sampling. The IR results using PnP with a pre-trained diffusion model demonstrate distinct advantages compared to those using discriminative denoisers, \ie improved perceptual quality while sacrificing the data fidelity. The unsatisfactory results are due to the lack of integration of these strategies in the IR tasks. In this work, we propose a novel zero-shot IR scheme, dubbed Reconciling Diffusion Model in Dual (RDMD), which leverages only a \textbf{single} pre-trained diffusion model to construct \textbf{two} complementary regularizers. Specifically, the diffusion model in RDMD will iteratively perform deterministic denoising and stochastic sampling, aiming to achieve high-fidelity image restoration with appealing perceptual quality. RDMD also allows users to customize the distortion-perception tradeoff with a single hyperparameter, enhancing the adaptability of the restoration process in different practical scenarios. Extensive experiments on several IR tasks demonstrate that our proposed method could achieve superior results compared to existing approaches on both the FFHQ and ImageNet datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.01288v1-abstract-full').style.display = 'none'; document.getElementById('2503.01288v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to CVPR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.00515">arXiv:2503.00515</a> <span> [<a href="https://arxiv.org/pdf/2503.00515">pdf</a>, <a href="https://arxiv.org/format/2503.00515">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Class-Independent Increment: An Efficient Approach for Multi-label Class-Incremental Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+S">Songlin Dong</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuhang He</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhengdong Zhou</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+H">Haoyu Luo</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+X">Xing Wei</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Y">Yihong Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.00515v1-abstract-short" style="display: inline;"> Current research on class-incremental learning primarily focuses on single-label classification tasks. However, real-world applications often involve multi-label scenarios, such as image retrieval and medical imaging. Therefore, this paper focuses on the challenging yet practical multi-label class-incremental learning (MLCIL) problem. In addition to the challenge of catastrophic forgetting, MLCIL… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.00515v1-abstract-full').style.display = 'inline'; document.getElementById('2503.00515v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.00515v1-abstract-full" style="display: none;"> Current research on class-incremental learning primarily focuses on single-label classification tasks. However, real-world applications often involve multi-label scenarios, such as image retrieval and medical imaging. Therefore, this paper focuses on the challenging yet practical multi-label class-incremental learning (MLCIL) problem. In addition to the challenge of catastrophic forgetting, MLCIL encounters issues related to feature confusion, encompassing inter-session and intra-feature confusion. To address these problems, we propose a novel MLCIL approach called class-independent increment (CLIN). Specifically, in contrast to existing methods that extract image-level features, we propose a class-independent incremental network (CINet) to extract multiple class-level embeddings for multi-label samples. It learns and preserves the knowledge of different classes by constructing class-specific tokens. On this basis, we develop two novel loss functions, optimizing the learning of class-specific tokens and class-level embeddings, respectively. These losses aim to distinguish between new and old classes, further alleviating the problem of feature confusion. Extensive experiments on MS-COCO and PASCAL VOC datasets demonstrate the effectiveness of our method for improving recognition performance and mitigating forgetting on various MLCIL tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.00515v1-abstract-full').style.display = 'none'; document.getElementById('2503.00515v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2503.00383">arXiv:2503.00383</a> <span> [<a href="https://arxiv.org/pdf/2503.00383">pdf</a>, <a href="https://arxiv.org/format/2503.00383">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Theoretical Insights in Model Inversion Robustness and Conditional Entropy Maximization for Collaborative Inference Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xia%2C+S">Song Xia</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yi Yu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+M">Meiwen Ding</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+L">Lingyu Duan</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xudong Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2503.00383v1-abstract-short" style="display: inline;"> By locally encoding raw data into intermediate features, collaborative inference enables end users to leverage powerful deep learning models without exposure of sensitive raw data to cloud servers. However, recent studies have revealed that these intermediate features may not sufficiently preserve privacy, as information can be leaked and raw data can be reconstructed via model inversion attacks (… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.00383v1-abstract-full').style.display = 'inline'; document.getElementById('2503.00383v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2503.00383v1-abstract-full" style="display: none;"> By locally encoding raw data into intermediate features, collaborative inference enables end users to leverage powerful deep learning models without exposure of sensitive raw data to cloud servers. However, recent studies have revealed that these intermediate features may not sufficiently preserve privacy, as information can be leaked and raw data can be reconstructed via model inversion attacks (MIAs). Obfuscation-based methods, such as noise corruption, adversarial representation learning, and information filters, enhance the inversion robustness by obfuscating the task-irrelevant redundancy empirically. However, methods for quantifying such redundancy remain elusive, and the explicit mathematical relation between this redundancy minimization and inversion robustness enhancement has not yet been established. To address that, this work first theoretically proves that the conditional entropy of inputs given intermediate features provides a guaranteed lower bound on the reconstruction mean square error (MSE) under any MIA. Then, we derive a differentiable and solvable measure for bounding this conditional entropy based on the Gaussian mixture estimation and propose a conditional entropy maximization (CEM) algorithm to enhance the inversion robustness. Experimental results on four datasets demonstrate the effectiveness and adaptability of our proposed CEM; without compromising feature utility and computing efficiency, plugging the proposed CEM into obfuscation-based defense mechanisms consistently boosts their inversion robustness, achieving average gains ranging from 12.9\% to 48.2\%. Code is available at \href{https://github.com/xiasong0501/CEM}{https://github.com/xiasong0501/CEM}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2503.00383v1-abstract-full').style.display = 'none'; document.getElementById('2503.00383v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 March, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by CVPR2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01646">arXiv:2412.01646</a> <span> [<a href="https://arxiv.org/pdf/2412.01646">pdf</a>, <a href="https://arxiv.org/format/2412.01646">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Robust and Transferable Backdoor Attacks Against Deep Image Compression With Selective Frequency Prior </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yi Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yufei Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+L">Lanqing Guo</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+S">Shijian Lu</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+L">Ling-Yu Duan</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Y">Yap-Peng Tan</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01646v1-abstract-short" style="display: inline;"> Recent advancements in deep learning-based compression techniques have surpassed traditional methods. However, deep neural networks remain vulnerable to backdoor attacks, where pre-defined triggers induce malicious behaviors. This paper introduces a novel frequency-based trigger injection model for launching backdoor attacks with multiple triggers on learned image compression models. Inspired by t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01646v1-abstract-full').style.display = 'inline'; document.getElementById('2412.01646v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01646v1-abstract-full" style="display: none;"> Recent advancements in deep learning-based compression techniques have surpassed traditional methods. However, deep neural networks remain vulnerable to backdoor attacks, where pre-defined triggers induce malicious behaviors. This paper introduces a novel frequency-based trigger injection model for launching backdoor attacks with multiple triggers on learned image compression models. Inspired by the widely used DCT in compression codecs, triggers are embedded in the DCT domain. We design attack objectives tailored to diverse scenarios, including: 1) degrading compression quality in terms of bit-rate and reconstruction accuracy; 2) targeting task-driven measures like face recognition and semantic segmentation. To improve training efficiency, we propose a dynamic loss function that balances loss terms with fewer hyper-parameters, optimizing attack objectives effectively. For advanced scenarios, we evaluate the attack's resistance to defensive preprocessing and propose a two-stage training schedule with robust frequency selection to enhance resilience. To improve cross-model and cross-domain transferability for downstream tasks, we adjust the classification boundary in the attack loss during training. Experiments show that our trigger injection models, combined with minor modifications to encoder parameters, successfully inject multiple backdoors and their triggers into a single compression model, demonstrating strong performance and versatility. (*Due to the notification of arXiv "The Abstract field cannot be longer than 1,920 characters", the appeared Abstract is shortened. For the full Abstract, please download the Article.) <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01646v1-abstract-full').style.display = 'none'; document.getElementById('2412.01646v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE TPAMI</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01345">arXiv:2412.01345</a> <span> [<a href="https://arxiv.org/pdf/2412.01345">pdf</a>, <a href="https://arxiv.org/format/2412.01345">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> See What You Seek: Semantic Contextual Integration for Cloth-Changing Person Re-Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiyu Han</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+X">Xian Zhong</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wenxin Huang</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xuemei Jia</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenxuan Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xiaohan Yu</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex Chichung Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01345v1-abstract-short" style="display: inline;"> Cloth-changing person re-identification (CC-ReID) aims to match individuals across multiple surveillance cameras despite variations in clothing. Existing methods typically focus on mitigating the effects of clothing changes or enhancing ID-relevant features but often struggle to capture complex semantic information. In this paper, we propose a novel prompt learning framework, Semantic Contextual I… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01345v1-abstract-full').style.display = 'inline'; document.getElementById('2412.01345v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01345v1-abstract-full" style="display: none;"> Cloth-changing person re-identification (CC-ReID) aims to match individuals across multiple surveillance cameras despite variations in clothing. Existing methods typically focus on mitigating the effects of clothing changes or enhancing ID-relevant features but often struggle to capture complex semantic information. In this paper, we propose a novel prompt learning framework, Semantic Contextual Integration (SCI), for CC-ReID, which leverages the visual-text representation capabilities of CLIP to minimize the impact of clothing changes and enhance ID-relevant features. Specifically, we introduce Semantic Separation Enhancement (SSE) module, which uses dual learnable text tokens to separately capture confounding and clothing-related semantic information, effectively isolating ID-relevant features from distracting clothing semantics. Additionally, we develop a Semantic-Guided Interaction Module (SIM) that uses orthogonalized text features to guide visual representations, sharpening the model's focus on distinctive ID characteristics. This integration enhances the model's discriminative power and enriches the visual context with high-dimensional semantic insights. Extensive experiments on three CC-ReID datasets demonstrate that our method outperforms state-of-the-art techniques. The code will be released at github. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01345v1-abstract-full').style.display = 'none'; document.getElementById('2412.01345v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 9 figures, submitted to IEEE TNNLS</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00811">arXiv:2412.00811</a> <span> [<a href="https://arxiv.org/pdf/2412.00811">pdf</a>, <a href="https://arxiv.org/format/2412.00811">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Vid-Morp: Video Moment Retrieval Pretraining from Unlabeled Videos in the Wild </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bao%2C+P">Peijun Bao</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+C">Chenqi Kong</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zihao Shao</a>, <a href="/search/cs?searchtype=author&query=Ng%2C+B+P">Boon Poh Ng</a>, <a href="/search/cs?searchtype=author&query=Er%2C+M+H">Meng Hwa Er</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00811v1-abstract-short" style="display: inline;"> Given a natural language query, video moment retrieval aims to localize the described temporal moment in an untrimmed video. A major challenge of this task is its heavy dependence on labor-intensive annotations for training. Unlike existing works that directly train models on manually curated data, we propose a novel paradigm to reduce annotation costs: pretraining the model on unlabeled, real-wor… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00811v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00811v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00811v1-abstract-full" style="display: none;"> Given a natural language query, video moment retrieval aims to localize the described temporal moment in an untrimmed video. A major challenge of this task is its heavy dependence on labor-intensive annotations for training. Unlike existing works that directly train models on manually curated data, we propose a novel paradigm to reduce annotation costs: pretraining the model on unlabeled, real-world videos. To support this, we introduce Video Moment Retrieval Pretraining (Vid-Morp), a large-scale dataset collected with minimal human intervention, consisting of over 50K videos captured in the wild and 200K pseudo annotations. Direct pretraining on these imperfect pseudo annotations, however, presents significant challenges, including mismatched sentence-video pairs and imprecise temporal boundaries. To address these issues, we propose the ReCorrect algorithm, which comprises two main phases: semantics-guided refinement and memory-consensus correction. The semantics-guided refinement enhances the pseudo labels by leveraging semantic similarity with video frames to clean out unpaired data and make initial adjustments to temporal boundaries. In the following memory-consensus correction phase, a memory bank tracks the model predictions, progressively correcting the temporal boundaries based on consensus within the memory. Comprehensive experiments demonstrate ReCorrect's strong generalization abilities across multiple downstream settings. Zero-shot ReCorrect achieves over 75% and 80% of the best fully-supervised performance on two benchmarks, while unsupervised ReCorrect reaches about 85% on both. The code, dataset, and pretrained models are available at https://github.com/baopj/Vid-Morp. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00811v1-abstract-full').style.display = 'none'; document.getElementById('2412.00811v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07945">arXiv:2411.07945</a> <span> [<a href="https://arxiv.org/pdf/2411.07945">pdf</a>, <a href="https://arxiv.org/format/2411.07945">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SimBase: A Simple Baseline for Temporal Video Grounding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bao%2C+P">Peijun Bao</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07945v1-abstract-short" style="display: inline;"> This paper presents SimBase, a simple yet effective baseline for temporal video grounding. While recent advances in temporal grounding have led to impressive performance, they have also driven network architectures toward greater complexity, with a range of methods to (1) capture temporal relationships and (2) achieve effective multimodal fusion. In contrast, this paper explores the question: How… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07945v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07945v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07945v1-abstract-full" style="display: none;"> This paper presents SimBase, a simple yet effective baseline for temporal video grounding. While recent advances in temporal grounding have led to impressive performance, they have also driven network architectures toward greater complexity, with a range of methods to (1) capture temporal relationships and (2) achieve effective multimodal fusion. In contrast, this paper explores the question: How effective can a simplified approach be? To investigate, we design SimBase, a network that leverages lightweight, one-dimensional temporal convolutional layers instead of complex temporal structures. For cross-modal interaction, SimBase only employs an element-wise product instead of intricate multimodal fusion. Remarkably, SimBase achieves state-of-the-art results on two large-scale datasets. As a simple yet powerful baseline, we hope SimBase will spark new ideas and streamline future evaluations in temporal video grounding. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07945v1-abstract-full').style.display = 'none'; document.getElementById('2411.07945v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Technical report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08466">arXiv:2410.08466</a> <span> [<a href="https://arxiv.org/pdf/2410.08466">pdf</a>, <a href="https://arxiv.org/format/2410.08466">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICECET61485.2024.10698085">10.1109/ICECET61485.2024.10698085 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Aligned Divergent Pathways for Omni-Domain Generalized Person Re-Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ang%2C+E+P+W">Eugene P. W. Ang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+S">Shan Lin</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08466v1-abstract-short" style="display: inline;"> Person Re-identification (Person ReID) has advanced significantly in fully supervised and domain generalized Person R e ID. However, methods developed for one task domain transfer poorly to the other. An ideal Person ReID method should be effective regardless of the number of domains involved in training or testing. Furthermore, given training data from the target domain, it should perform at leas… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08466v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08466v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08466v1-abstract-full" style="display: none;"> Person Re-identification (Person ReID) has advanced significantly in fully supervised and domain generalized Person R e ID. However, methods developed for one task domain transfer poorly to the other. An ideal Person ReID method should be effective regardless of the number of domains involved in training or testing. Furthermore, given training data from the target domain, it should perform at least as well as state-of-the-art (SOTA) fully supervised Person ReID methods. We call this paradigm Omni-Domain Generalization Person ReID, referred to as ODG-ReID, and propose a way to achieve this by expanding compatible backbone architectures into multiple diverse pathways. Our method, Aligned Divergent Pathways (ADP), first converts a base architecture into a multi-branch structure by copying the tail of the original backbone. We design our module Dynamic Max-Deviance Adaptive Instance Normalization (DyMAIN) that encourages learning of generalized features that are robust to omni-domain directions and apply DyMAIN to the branches of ADP. Our proposed Phased Mixture-of-Cosines (PMoC) coordinates a mix of stable and turbulent learning rate schedules among branches for further diversified learning. Finally, we realign the feature space between branches with our proposed Dimensional Consistency Metric Loss (DCML). ADP outperforms the state-of-the-art (SOTA) results for multi-source domain generalization and supervised ReID within the same domain. Furthermore, our method demonstrates improvement on a wide range of single-source domain generalization benchmarks, achieving Omni-Domain Generalization over Person ReID tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08466v1-abstract-full').style.display = 'none'; document.getElementById('2410.08466v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2024 International Conference on Electrical, Computer and Energy Technologies (ICECET)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08460">arXiv:2410.08460</a> <span> [<a href="https://arxiv.org/pdf/2410.08460">pdf</a>, <a href="https://arxiv.org/format/2410.08460">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3665026.3665036">10.1145/3665026.3665036 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Diverse Deep Feature Ensemble Learning for Omni-Domain Generalized Person Re-identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ang%2C+E+P+W">Eugene P. W. Ang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+S">Shan Lin</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08460v1-abstract-short" style="display: inline;"> Person Re-identification (Person ReID) has progressed to a level where single-domain supervised Person ReID performance has saturated. However, such methods experience a significant drop in performance when trained and tested across different datasets, motivating the development of domain generalization techniques. However, our research reveals that domain generalization methods significantly unde… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08460v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08460v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08460v1-abstract-full" style="display: none;"> Person Re-identification (Person ReID) has progressed to a level where single-domain supervised Person ReID performance has saturated. However, such methods experience a significant drop in performance when trained and tested across different datasets, motivating the development of domain generalization techniques. However, our research reveals that domain generalization methods significantly underperform single-domain supervised methods on single dataset benchmarks. An ideal Person ReID method should be effective regardless of the number of domains involved, and when test domain data is available for training it should perform as well as state-of-the-art (SOTA) fully supervised methods. This is a paradigm that we call Omni-Domain Generalization Person ReID (ODG-ReID). We propose a way to achieve ODG-ReID by creating deep feature diversity with self-ensembles. Our method, Diverse Deep Feature Ensemble Learning (D2FEL), deploys unique instance normalization patterns that generate multiple diverse views and recombines these views into a compact encoding. To the best of our knowledge, our work is one of few to consider omni-domain generalization in Person ReID, and we advance the study of applying feature ensembles in Person ReID. D2FEL significantly improves and matches the SOTA performance for major domain generalization and single-domain supervised benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08460v1-abstract-full').style.display = 'none'; document.getElementById('2410.08460v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICMIP '24: Proceedings of the 2024 9th International Conference on Multimedia and Image Processing, Pages 64 - 71</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08456">arXiv:2410.08456</a> <span> [<a href="https://arxiv.org/pdf/2410.08456">pdf</a>, <a href="https://arxiv.org/format/2410.08456">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.neucom.2024.128120">10.1016/j.neucom.2024.128120 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A Unified Deep Semantic Expansion Framework for Domain-Generalized Person Re-identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ang%2C+E+P+W">Eugene P. W. Ang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+S">Shan Lin</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08456v1-abstract-short" style="display: inline;"> Supervised Person Re-identification (Person ReID) methods have achieved excellent performance when training and testing within one camera network. However, they usually suffer from considerable performance degradation when applied to different camera systems. In recent years, many Domain Adaptation Person ReID methods have been proposed, achieving impressive performance without requiring labeled d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08456v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08456v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08456v1-abstract-full" style="display: none;"> Supervised Person Re-identification (Person ReID) methods have achieved excellent performance when training and testing within one camera network. However, they usually suffer from considerable performance degradation when applied to different camera systems. In recent years, many Domain Adaptation Person ReID methods have been proposed, achieving impressive performance without requiring labeled data from the target domain. However, these approaches still need the unlabeled data of the target domain during the training process, making them impractical in many real-world scenarios. Our work focuses on the more practical Domain Generalized Person Re-identification (DG-ReID) problem. Given one or more source domains, it aims to learn a generalized model that can be applied to unseen target domains. One promising research direction in DG-ReID is the use of implicit deep semantic feature expansion, and our previous method, Domain Embedding Expansion (DEX), is one such example that achieves powerful results in DG-ReID. However, in this work we show that DEX and other similar implicit deep semantic feature expansion methods, due to limitations in their proposed loss function, fail to reach their full potential on large evaluation benchmarks as they have a tendency to saturate too early. Leveraging on this analysis, we propose Unified Deep Semantic Expansion, our novel framework that unifies implicit and explicit semantic feature expansion techniques in a single framework to mitigate this early over-fitting and achieve a new state-of-the-art (SOTA) in all DG-ReID benchmarks. Further, we apply our method on more general image retrieval tasks, also surpassing the current SOTA in all of these benchmarks by wide margins. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08456v1-abstract-full').style.display = 'none'; document.getElementById('2410.08456v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Neurocomputing Volume 600, 1 October 2024, 128120. 15 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06811">arXiv:2410.06811</a> <span> [<a href="https://arxiv.org/pdf/2410.06811">pdf</a>, <a href="https://arxiv.org/format/2410.06811">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Rethinking the Evaluation of Visible and Infrared Image Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guan%2C+D">Dayan Guan</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yixuan Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianzhu Liu</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yanfeng Gu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06811v1-abstract-short" style="display: inline;"> Visible and Infrared Image Fusion (VIF) has garnered significant interest across a wide range of high-level vision tasks, such as object detection and semantic segmentation. However, the evaluation of VIF methods remains challenging due to the absence of ground truth. This paper proposes a Segmentation-oriented Evaluation Approach (SEA) to assess VIF methods by incorporating the semantic segmentat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06811v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06811v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06811v1-abstract-full" style="display: none;"> Visible and Infrared Image Fusion (VIF) has garnered significant interest across a wide range of high-level vision tasks, such as object detection and semantic segmentation. However, the evaluation of VIF methods remains challenging due to the absence of ground truth. This paper proposes a Segmentation-oriented Evaluation Approach (SEA) to assess VIF methods by incorporating the semantic segmentation task and leveraging segmentation labels available in latest VIF datasets. Specifically, SEA utilizes universal segmentation models, capable of handling diverse images and classes, to predict segmentation outputs from fused images and compare these outputs with segmentation labels. Our evaluation of recent VIF methods using SEA reveals that their performance is comparable or even inferior to using visible images only, despite nearly half of the infrared images demonstrating better performance than visible images. Further analysis indicates that the two metrics most correlated to our SEA are the gradient-based fusion metric $Q_{\text{ABF}}$ and the visual information fidelity metric $Q_{\text{VIFF}}$ in conventional VIF evaluation metrics, which can serve as proxies when segmentation labels are unavailable. We hope that our evaluation will guide the development of novel and practical VIF methods. The code has been released in \url{https://github.com/Yixuan-2002/SEA/}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06811v1-abstract-full').style.display = 'none'; document.getElementById('2410.06811v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The code has been released in \url{https://github.com/Yixuan-2002/SEA/}</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12791">arXiv:2408.12791</a> <span> [<a href="https://arxiv.org/pdf/2408.12791">pdf</a>, <a href="https://arxiv.org/format/2408.12791">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Open-Set Deepfake Detection: A Parameter-Efficient Adaptation Method with Forgery Style Mixture </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kong%2C+C">Chenqi Kong</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+A">Anwei Luo</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+P">Peijun Bao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoliang Li</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+R">Renjie Wan</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zengwei Zheng</a>, <a href="/search/cs?searchtype=author&query=Rocha%2C+A">Anderson Rocha</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.12791v1-abstract-short" style="display: inline;"> Open-set face forgery detection poses significant security threats and presents substantial challenges for existing detection models. These detectors primarily have two limitations: they cannot generalize across unknown forgery domains and inefficiently adapt to new data. To address these issues, we introduce an approach that is both general and parameter-efficient for face forgery detection. It b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12791v1-abstract-full').style.display = 'inline'; document.getElementById('2408.12791v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.12791v1-abstract-full" style="display: none;"> Open-set face forgery detection poses significant security threats and presents substantial challenges for existing detection models. These detectors primarily have two limitations: they cannot generalize across unknown forgery domains and inefficiently adapt to new data. To address these issues, we introduce an approach that is both general and parameter-efficient for face forgery detection. It builds on the assumption that different forgery source domains exhibit distinct style statistics. Previous methods typically require fully fine-tuning pre-trained networks, consuming substantial time and computational resources. In turn, we design a forgery-style mixture formulation that augments the diversity of forgery source domains, enhancing the model's generalizability across unseen domains. Drawing on recent advancements in vision transformers (ViT) for face forgery detection, we develop a parameter-efficient ViT-based detection model that includes lightweight forgery feature extraction modules and enables the model to extract global and local forgery clues simultaneously. We only optimize the inserted lightweight modules during training, maintaining the original ViT structure with its pre-trained ImageNet weights. This training strategy effectively preserves the informative pre-trained knowledge while flexibly adapting the model to the task of Deepfake detection. Extensive experimental results demonstrate that the designed model achieves state-of-the-art generalizability with significantly reduced trainable parameters, representing an important step toward open-set Deepfake detection in the wild. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12791v1-abstract-full').style.display = 'none'; document.getElementById('2408.12791v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.08865">arXiv:2407.08865</a> <span> [<a href="https://arxiv.org/pdf/2407.08865">pdf</a>, <a href="https://arxiv.org/format/2407.08865">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Single-Image Shadow Removal Using Deep Learning: A Comprehensive Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+L">Laniqng Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chong Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yufei Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yi Yu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+S">Siyu Huang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+B">Bihan Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.08865v2-abstract-short" style="display: inline;"> Shadow removal aims at restoring the image content within shadow regions, pursuing a uniform distribution of illumination that is consistent between shadow and non-shadow regions. {Comparing to other image restoration tasks, there are two unique challenges in shadow removal:} 1) The patterns of shadows are arbitrary, varied, and often have highly complex trace structures, making ``trace-less'' ima… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08865v2-abstract-full').style.display = 'inline'; document.getElementById('2407.08865v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.08865v2-abstract-full" style="display: none;"> Shadow removal aims at restoring the image content within shadow regions, pursuing a uniform distribution of illumination that is consistent between shadow and non-shadow regions. {Comparing to other image restoration tasks, there are two unique challenges in shadow removal:} 1) The patterns of shadows are arbitrary, varied, and often have highly complex trace structures, making ``trace-less'' image recovery difficult. 2) The degradation caused by shadows is spatially non-uniform, resulting in inconsistencies in illumination and color between shadow and non-shadow areas. Recent developments in this field are primarily driven by deep learning-based solutions, employing a variety of learning strategies, network architectures, loss functions, and training data. Nevertheless, a thorough and insightful review of deep learning-based shadow removal techniques is still lacking. In this paper, we are the first to provide a comprehensive survey to cover various aspects ranging from technical details to applications. We highlight the major advancements in deep learning-based single-image shadow removal methods, thoroughly review previous research across various categories, and provide insights into the historical progression of these developments. Additionally, we summarize performance comparisons both quantitatively and qualitatively. Beyond the technical aspects of shadow removal methods, we also explore potential future directions for this field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.08865v2-abstract-full').style.display = 'none'; document.getElementById('2407.08865v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">url: https://github.com/GuoLanqing/Awesome-Shadow-Removal</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.17349">arXiv:2406.17349</a> <span> [<a href="https://arxiv.org/pdf/2406.17349">pdf</a>, <a href="https://arxiv.org/format/2406.17349">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Semantic Deep Hiding for Robust Unlearnable Examples </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Meng%2C+R">Ruohan Meng</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+C">Chenyu Yi</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yi Yu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Siyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+B">Bingquan Shen</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.17349v1-abstract-short" style="display: inline;"> Ensuring data privacy and protection has become paramount in the era of deep learning. Unlearnable examples are proposed to mislead the deep learning models and prevent data from unauthorized exploration by adding small perturbations to data. However, such perturbations (e.g., noise, texture, color change) predominantly impact low-level features, making them vulnerable to common countermeasures. I… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17349v1-abstract-full').style.display = 'inline'; document.getElementById('2406.17349v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.17349v1-abstract-full" style="display: none;"> Ensuring data privacy and protection has become paramount in the era of deep learning. Unlearnable examples are proposed to mislead the deep learning models and prevent data from unauthorized exploration by adding small perturbations to data. However, such perturbations (e.g., noise, texture, color change) predominantly impact low-level features, making them vulnerable to common countermeasures. In contrast, semantic images with intricate shapes have a wealth of high-level features, making them more resilient to countermeasures and potential for producing robust unlearnable examples. In this paper, we propose a Deep Hiding (DH) scheme that adaptively hides semantic images enriched with high-level features. We employ an Invertible Neural Network (INN) to invisibly integrate predefined images, inherently hiding them with deceptive perturbations. To enhance data unlearnability, we introduce a Latent Feature Concentration module, designed to work with the INN, regularizing the intra-class variance of these perturbations. To further boost the robustness of unlearnable examples, we design a Semantic Images Generation module that produces hidden semantic images. By utilizing similar semantic information, this module generates similar semantic images for samples within the same classes, thereby enlarging the inter-class distance and narrowing the intra-class distance. Extensive experiments on CIFAR-10, CIFAR-100, and an ImageNet subset, against 18 countermeasures, reveal that our proposed method exhibits outstanding robustness for unlearnable examples, demonstrating its efficacy in preventing unauthorized data exploitation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17349v1-abstract-full').style.display = 'none'; document.getElementById('2406.17349v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by TIFS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.09121">arXiv:2406.09121</a> <span> [<a href="https://arxiv.org/pdf/2406.09121">pdf</a>, <a href="https://arxiv.org/format/2406.09121">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MMRel: A Relation Understanding Benchmark in the MLLM Era </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jiahao Nie</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Gongjie Zhang</a>, <a href="/search/cs?searchtype=author&query=An%2C+W">Wenbin An</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Y">Yap-Peng Tan</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+S">Shijian Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.09121v2-abstract-short" style="display: inline;"> Though Multi-modal Large Language Models (MLLMs) have recently achieved significant progress, they often face various problems while handling inter-object relations, i.e., the interaction or association among distinct objects. This constraint largely stems from insufficient training and evaluation data for relation understanding, which has greatly impeded MLLMs in various vision-language generatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09121v2-abstract-full').style.display = 'inline'; document.getElementById('2406.09121v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.09121v2-abstract-full" style="display: none;"> Though Multi-modal Large Language Models (MLLMs) have recently achieved significant progress, they often face various problems while handling inter-object relations, i.e., the interaction or association among distinct objects. This constraint largely stems from insufficient training and evaluation data for relation understanding, which has greatly impeded MLLMs in various vision-language generation and reasoning tasks. We attempt to address this challenge by introducing Multi-Modal Relation Understanding (MMRel), a benchmark that features large-scale, high-quality, and diverse data on inter-object relations. MMRel features three distinctive attributes: (i) It contains over 22K question-answer pairs, spanning three distinct domains and covering three relation categories, ensuring both scale and diversity; (ii) it provides manually verified, high-quality labels to ensure exceptional annotation accuracy; (iii) it includes adversarial cases with highly unusual relations, offering a challenging setting for evaluating relation hallucination. These features make MMRel ideal for evaluating MLLMs on relation understanding, as well as for fine-tuning MLLMs to enhance relation comprehension capability. Extensive experiments verify the effectiveness of MMRel in evaluating and enhancing MLLMs' relation understanding capabilities. The benchmark has been released publicly at: https://niejiahao1998.github.io/MMRel/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.09121v2-abstract-full').style.display = 'none'; document.getElementById('2406.09121v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.20721">arXiv:2405.20721</a> <span> [<a href="https://arxiv.org/pdf/2405.20721">pdf</a>, <a href="https://arxiv.org/format/2405.20721">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ContextGS: Compact 3D Gaussian Splatting with Anchor Level Context Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yufei Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhihao Li</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+L">Lanqing Guo</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+B">Bihan Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.20721v1-abstract-short" style="display: inline;"> Recently, 3D Gaussian Splatting (3DGS) has become a promising framework for novel view synthesis, offering fast rendering speeds and high fidelity. However, the large number of Gaussians and their associated attributes require effective compression techniques. Existing methods primarily compress neural Gaussians individually and independently, i.e., coding all the neural Gaussians at the same time… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.20721v1-abstract-full').style.display = 'inline'; document.getElementById('2405.20721v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.20721v1-abstract-full" style="display: none;"> Recently, 3D Gaussian Splatting (3DGS) has become a promising framework for novel view synthesis, offering fast rendering speeds and high fidelity. However, the large number of Gaussians and their associated attributes require effective compression techniques. Existing methods primarily compress neural Gaussians individually and independently, i.e., coding all the neural Gaussians at the same time, with little design for their interactions and spatial dependence. Inspired by the effectiveness of the context model in image compression, we propose the first autoregressive model at the anchor level for 3DGS compression in this work. We divide anchors into different levels and the anchors that are not coded yet can be predicted based on the already coded ones in all the coarser levels, leading to more accurate modeling and higher coding efficiency. To further improve the efficiency of entropy coding, e.g., to code the coarsest level with no already coded anchors, we propose to introduce a low-dimensional quantized feature as the hyperprior for each anchor, which can be effectively compressed. Our work pioneers the context model in the anchor level for 3DGS representation, yielding an impressive size reduction of over 100 times compared to vanilla 3DGS and 15 times compared to the most recent state-of-the-art work Scaffold-GS, while achieving comparable or even higher rendering quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.20721v1-abstract-full').style.display = 'none'; document.getElementById('2405.20721v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.11852">arXiv:2405.11852</a> <span> [<a href="https://arxiv.org/pdf/2405.11852">pdf</a>, <a href="https://arxiv.org/format/2405.11852">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Evolving Storytelling: Benchmarks and Methods for New Character Customization with Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiyu Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yufei Wang</a>, <a href="/search/cs?searchtype=author&query=Tsutsui%2C+S">Satoshi Tsutsui</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Weisi Lin</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+B">Bihan Wen</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.11852v1-abstract-short" style="display: inline;"> Diffusion-based models for story visualization have shown promise in generating content-coherent images for storytelling tasks. However, how to effectively integrate new characters into existing narratives while maintaining character consistency remains an open problem, particularly with limited data. Two major limitations hinder the progress: (1) the absence of a suitable benchmark due to potenti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.11852v1-abstract-full').style.display = 'inline'; document.getElementById('2405.11852v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.11852v1-abstract-full" style="display: none;"> Diffusion-based models for story visualization have shown promise in generating content-coherent images for storytelling tasks. However, how to effectively integrate new characters into existing narratives while maintaining character consistency remains an open problem, particularly with limited data. Two major limitations hinder the progress: (1) the absence of a suitable benchmark due to potential character leakage and inconsistent text labeling, and (2) the challenge of distinguishing between new and old characters, leading to ambiguous results. To address these challenges, we introduce the NewEpisode benchmark, comprising refined datasets designed to evaluate generative models' adaptability in generating new stories with fresh characters using just a single example story. The refined dataset involves refined text prompts and eliminates character leakage. Additionally, to mitigate the character confusion of generated results, we propose EpicEvo, a method that customizes a diffusion-based visual story generation model with a single story featuring the new characters seamlessly integrating them into established character dynamics. EpicEvo introduces a novel adversarial character alignment module to align the generated images progressively in the diffusive process, with exemplar images of new characters, while applying knowledge distillation to prevent forgetting of characters and background details. Our evaluation quantitatively demonstrates that EpicEvo outperforms existing baselines on the NewEpisode benchmark, and qualitative studies confirm its superior customization of visual story generation in diffusion models. In summary, EpicEvo provides an effective way to incorporate new characters using only one example story, unlocking new possibilities for applications such as serialized cartoons. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.11852v1-abstract-full').style.display = 'none'; document.getElementById('2405.11852v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.09487">arXiv:2405.09487</a> <span> [<a href="https://arxiv.org/pdf/2405.09487">pdf</a>, <a href="https://arxiv.org/format/2405.09487">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Color Space Learning for Cross-Color Person Re-Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jiahao Nie</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+S">Shan Lin</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.09487v1-abstract-short" style="display: inline;"> The primary color profile of the same identity is assumed to remain consistent in typical Person Re-identification (Person ReID) tasks. However, this assumption may be invalid in real-world situations and images hold variant color profiles, because of cross-modality cameras or identity with different clothing. To address this issue, we propose Color Space Learning (CSL) for those Cross-Color Perso… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.09487v1-abstract-full').style.display = 'inline'; document.getElementById('2405.09487v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.09487v1-abstract-full" style="display: none;"> The primary color profile of the same identity is assumed to remain consistent in typical Person Re-identification (Person ReID) tasks. However, this assumption may be invalid in real-world situations and images hold variant color profiles, because of cross-modality cameras or identity with different clothing. To address this issue, we propose Color Space Learning (CSL) for those Cross-Color Person ReID problems. Specifically, CSL guides the model to be less color-sensitive with two modules: Image-level Color-Augmentation and Pixel-level Color-Transformation. The first module increases the color diversity of the inputs and guides the model to focus more on the non-color information. The second module projects every pixel of input images onto a new color space. In addition, we introduce a new Person ReID benchmark across RGB and Infrared modalities, NTU-Corridor, which is the first with privacy agreements from all participants. To evaluate the effectiveness and robustness of our proposed CSL, we evaluate it on several Cross-Color Person ReID benchmarks. Our method surpasses the state-of-the-art methods consistently. The code and benchmark are available at: https://github.com/niejiahao1998/CSL <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.09487v1-abstract-full').style.display = 'none'; document.getElementById('2405.09487v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICME 2024 (Oral)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.06995">arXiv:2405.06995</a> <span> [<a href="https://arxiv.org/pdf/2405.06995">pdf</a>, <a href="https://arxiv.org/format/2405.06995">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking Cross-Domain Audio-Visual Deception Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xiaobao Guo</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zitong Yu</a>, <a href="/search/cs?searchtype=author&query=Selvaraj%2C+N+M">Nithish Muthuchamy Selvaraj</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+B">Bingquan Shen</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+A+W">Adams Wai-Kin Kong</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.06995v2-abstract-short" style="display: inline;"> Automated deception detection is crucial for assisting humans in accurately assessing truthfulness and identifying deceptive behavior. Conventional contact-based techniques, like polygraph devices, rely on physiological signals to determine the authenticity of an individual's statements. Nevertheless, recent developments in automated deception detection have demonstrated that multimodal features d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.06995v2-abstract-full').style.display = 'inline'; document.getElementById('2405.06995v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.06995v2-abstract-full" style="display: none;"> Automated deception detection is crucial for assisting humans in accurately assessing truthfulness and identifying deceptive behavior. Conventional contact-based techniques, like polygraph devices, rely on physiological signals to determine the authenticity of an individual's statements. Nevertheless, recent developments in automated deception detection have demonstrated that multimodal features derived from both audio and video modalities may outperform human observers on publicly available datasets. Despite these positive findings, the generalizability of existing audio-visual deception detection approaches across different scenarios remains largely unexplored. To close this gap, we present the first cross-domain audio-visual deception detection benchmark, that enables us to assess how well these methods generalize for use in real-world scenarios. We used widely adopted audio and visual features and different architectures for benchmarking, comparing single-to-single and multi-to-single domain generalization performance. To further exploit the impacts using data from multiple source domains for training, we investigate three types of domain sampling strategies, including domain-simultaneous, domain-alternating, and domain-by-domain for multi-to-single domain generalization evaluation. We also propose an algorithm to enhance the generalization performance by maximizing the gradient inner products between modality encoders, named ``MM-IDGM". Furthermore, we proposed the Attention-Mixer fusion method to improve performance, and we believe that this new cross-domain benchmark will facilitate future research in audio-visual deception detection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.06995v2-abstract-full').style.display = 'none'; document.getElementById('2405.06995v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.01460">arXiv:2405.01460</a> <span> [<a href="https://arxiv.org/pdf/2405.01460">pdf</a>, <a href="https://arxiv.org/format/2405.01460">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Purify Unlearnable Examples via Rate-Constrained Variational Autoencoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yi Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yufei Wang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+S">Song Xia</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+S">Shijian Lu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Y">Yap-Peng Tan</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.01460v2-abstract-short" style="display: inline;"> Unlearnable examples (UEs) seek to maximize testing error by making subtle modifications to training examples that are correctly labeled. Defenses against these poisoning attacks can be categorized based on whether specific interventions are adopted during training. The first approach is training-time defense, such as adversarial training, which can mitigate poisoning effects but is computationall… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.01460v2-abstract-full').style.display = 'inline'; document.getElementById('2405.01460v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.01460v2-abstract-full" style="display: none;"> Unlearnable examples (UEs) seek to maximize testing error by making subtle modifications to training examples that are correctly labeled. Defenses against these poisoning attacks can be categorized based on whether specific interventions are adopted during training. The first approach is training-time defense, such as adversarial training, which can mitigate poisoning effects but is computationally intensive. The other approach is pre-training purification, e.g., image short squeezing, which consists of several simple compressions but often encounters challenges in dealing with various UEs. Our work provides a novel disentanglement mechanism to build an efficient pre-training purification method. Firstly, we uncover rate-constrained variational autoencoders (VAEs), demonstrating a clear tendency to suppress the perturbations in UEs. We subsequently conduct a theoretical analysis for this phenomenon. Building upon these insights, we introduce a disentangle variational autoencoder (D-VAE), capable of disentangling the perturbations with learnable class-wise embeddings. Based on this network, a two-stage purification approach is naturally developed. The first stage focuses on roughly eliminating perturbations, while the second stage produces refined, poison-free results, ensuring effectiveness and robustness across various scenarios. Extensive experiments demonstrate the remarkable performance of our method across CIFAR-10, CIFAR-100, and a 100-class ImageNet-subset. Code is available at https://github.com/yuyi-sd/D-VAE. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.01460v2-abstract-full').style.display = 'none'; document.getElementById('2405.01460v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICML 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.13576">arXiv:2404.13576</a> <span> [<a href="https://arxiv.org/pdf/2404.13576">pdf</a>, <a href="https://arxiv.org/format/2404.13576">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> I2CANSAY:Inter-Class Analogical Augmentation and Intra-Class Significance Analysis for Non-Exemplar Online Task-Free Continual Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+S">Songlin Dong</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yingjie Chen</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuhang He</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Y">Yuhan Jin</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Y">Yihong Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.13576v1-abstract-short" style="display: inline;"> Online task-free continual learning (OTFCL) is a more challenging variant of continual learning which emphasizes the gradual shift of task boundaries and learns in an online mode. Existing methods rely on a memory buffer composed of old samples to prevent forgetting. However,the use of memory buffers not only raises privacy concerns but also hinders the efficient learning of new samples. To addres… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13576v1-abstract-full').style.display = 'inline'; document.getElementById('2404.13576v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.13576v1-abstract-full" style="display: none;"> Online task-free continual learning (OTFCL) is a more challenging variant of continual learning which emphasizes the gradual shift of task boundaries and learns in an online mode. Existing methods rely on a memory buffer composed of old samples to prevent forgetting. However,the use of memory buffers not only raises privacy concerns but also hinders the efficient learning of new samples. To address this problem, we propose a novel framework called I2CANSAY that gets rid of the dependence on memory buffers and efficiently learns the knowledge of new data from one-shot samples. Concretely, our framework comprises two main modules. Firstly, the Inter-Class Analogical Augmentation (ICAN) module generates diverse pseudo-features for old classes based on the inter-class analogy of feature distributions for different new classes, serving as a substitute for the memory buffer. Secondly, the Intra-Class Significance Analysis (ISAY) module analyzes the significance of attributes for each class via its distribution standard deviation, and generates the importance vector as a correction bias for the linear classifier, thereby enhancing the capability of learning from new samples. We run our experiments on four popular image classification datasets: CoRe50, CIFAR-10, CIFAR-100, and CUB-200, our approach outperforms the prior state-of-the-art by a large margin. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.13576v1-abstract-full').style.display = 'none'; document.getElementById('2404.13576v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.08452">arXiv:2404.08452</a> <span> [<a href="https://arxiv.org/pdf/2404.08452">pdf</a>, <a href="https://arxiv.org/format/2404.08452">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MoE-FFD: Mixture of Experts for Generalized and Parameter-Efficient Face Forgery Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kong%2C+C">Chenqi Kong</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+A">Anwei Luo</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+P">Peijun Bao</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yi Yu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoliang Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zengwei Zheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shiqi Wang</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.08452v2-abstract-short" style="display: inline;"> Deepfakes have recently raised significant trust issues and security concerns among the public. Compared to CNN face forgery detectors, ViT-based methods take advantage of the expressivity of transformers, achieving superior detection performance. However, these approaches still exhibit the following limitations: (1) Fully fine-tuning ViT-based models from ImageNet weights demands substantial comp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.08452v2-abstract-full').style.display = 'inline'; document.getElementById('2404.08452v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.08452v2-abstract-full" style="display: none;"> Deepfakes have recently raised significant trust issues and security concerns among the public. Compared to CNN face forgery detectors, ViT-based methods take advantage of the expressivity of transformers, achieving superior detection performance. However, these approaches still exhibit the following limitations: (1) Fully fine-tuning ViT-based models from ImageNet weights demands substantial computational and storage resources; (2) ViT-based methods struggle to capture local forgery clues, leading to model bias; (3) These methods limit their scope on only one or few face forgery features, resulting in limited generalizability. To tackle these challenges, this work introduces Mixture-of-Experts modules for Face Forgery Detection (MoE-FFD), a generalized yet parameter-efficient ViT-based approach. MoE-FFD only updates lightweight Low-Rank Adaptation (LoRA) and Adapter layers while keeping the ViT backbone frozen, thereby achieving parameter-efficient training. Moreover, MoE-FFD leverages the expressivity of transformers and local priors of CNNs to simultaneously extract global and local forgery clues. Additionally, novel MoE modules are designed to scale the model's capacity and smartly select optimal forgery experts, further enhancing forgery detection performance. Our proposed learning scheme can be seamlessly adapted to various transformer backbones in a plug-and-play manner. Extensive experimental results demonstrate that the proposed method achieves state-of-the-art face forgery detection performance with significantly reduced parameter overhead. The code is released at: https://github.com/LoveSiameseCat/MoE-FFD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.08452v2-abstract-full').style.display = 'none'; document.getElementById('2404.08452v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2401.08407">arXiv:2401.08407</a> <span> [<a href="https://arxiv.org/pdf/2401.08407">pdf</a>, <a href="https://arxiv.org/format/2401.08407">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Cross-Domain Few-Shot Segmentation via Iterative Support-Query Correspondence Mining </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jiahao Nie</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+Y">Yun Xing</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Gongjie Zhang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+P">Pei Yan</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+A">Aoran Xiao</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Y">Yap-Peng Tan</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+S">Shijian Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2401.08407v2-abstract-short" style="display: inline;"> Cross-Domain Few-Shot Segmentation (CD-FSS) poses the challenge of segmenting novel categories from a distinct domain using only limited exemplars. In this paper, we undertake a comprehensive study of CD-FSS and uncover two crucial insights: (i) the necessity of a fine-tuning stage to effectively transfer the learned meta-knowledge across domains, and (ii) the overfitting risk during the na茂ve fin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.08407v2-abstract-full').style.display = 'inline'; document.getElementById('2401.08407v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2401.08407v2-abstract-full" style="display: none;"> Cross-Domain Few-Shot Segmentation (CD-FSS) poses the challenge of segmenting novel categories from a distinct domain using only limited exemplars. In this paper, we undertake a comprehensive study of CD-FSS and uncover two crucial insights: (i) the necessity of a fine-tuning stage to effectively transfer the learned meta-knowledge across domains, and (ii) the overfitting risk during the na茂ve fine-tuning due to the scarcity of novel category examples. With these insights, we propose a novel cross-domain fine-tuning strategy that addresses the challenging CD-FSS tasks. We first design Bi-directional Few-shot Prediction (BFP), which establishes support-query correspondence in a bi-directional manner, crafting augmented supervision to reduce the overfitting risk. Then we further extend BFP into Iterative Few-shot Adaptor (IFA), which is a recursive framework to capture the support-query correspondence iteratively, targeting maximal exploitation of supervisory signals from the sparse novel category samples. Extensive empirical evaluations show that our method significantly outperforms the state-of-the-arts (+7.8\%), which verifies that IFA tackles the cross-domain challenges and mitigates the overfitting simultaneously. The code is available at: https://github.com/niejiahao1998/IFA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2401.08407v2-abstract-full').style.display = 'none'; document.getElementById('2401.08407v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2312.15490">arXiv:2312.15490</a> <span> [<a href="https://arxiv.org/pdf/2312.15490">pdf</a>, <a href="https://arxiv.org/format/2312.15490">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Diffusion-EXR: Controllable Review Generation for Explainable Recommendation via Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+L">Ling Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shaohua Li</a>, <a href="/search/cs?searchtype=author&query=Marantika%2C+W">Winda Marantika</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+H">Huijing Zhan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2312.15490v4-abstract-short" style="display: inline;"> Denoising Diffusion Probabilistic Model (DDPM) has shown great competence in image and audio generation tasks. However, there exist few attempts to employ DDPM in the text generation, especially review generation under recommendation systems. Fueled by the predicted reviews explainability that justifies recommendations could assist users better understand the recommended items and increase the tra… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.15490v4-abstract-full').style.display = 'inline'; document.getElementById('2312.15490v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2312.15490v4-abstract-full" style="display: none;"> Denoising Diffusion Probabilistic Model (DDPM) has shown great competence in image and audio generation tasks. However, there exist few attempts to employ DDPM in the text generation, especially review generation under recommendation systems. Fueled by the predicted reviews explainability that justifies recommendations could assist users better understand the recommended items and increase the transparency of recommendation system, we propose a Diffusion Model-based Review Generation towards EXplainable Recommendation named Diffusion-EXR. Diffusion-EXR corrupts the sequence of review embeddings by incrementally introducing varied levels of Gaussian noise to the sequence of word embeddings and learns to reconstruct the original word representations in the reverse process. The nature of DDPM enables our lightweight Transformer backbone to perform excellently in the recommendation review generation task. Extensive experimental results have demonstrated that Diffusion-EXR can achieve state-of-the-art review generation for recommendation on two publicly available benchmark datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2312.15490v4-abstract-full').style.display = 'none'; document.getElementById('2312.15490v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 December, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">We request to withdraw our paper from the archive due to significant errors identified in the analysis and conclusions. Upon further review, we realized that these errors undermine the validity of our findings. We plan to conduct additional research to correct these issues and resubmit a revised version in the future</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2311.14760">arXiv:2311.14760</a> <span> [<a href="https://arxiv.org/pdf/2311.14760">pdf</a>, <a href="https://arxiv.org/format/2311.14760">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SinSR: Diffusion-Based Image Super-Resolution in a Single Step </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yufei Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinyuan Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yaohui Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+L">Lanqing Guo</a>, <a href="/search/cs?searchtype=author&query=Chau%2C+L">Lap-Pui Chau</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Ziwei Liu</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+B">Bihan Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2311.14760v1-abstract-short" style="display: inline;"> While super-resolution (SR) methods based on diffusion models exhibit promising results, their practical application is hindered by the substantial number of required inference steps. Recent methods utilize degraded images in the initial state, thereby shortening the Markov chain. Nevertheless, these solutions either rely on a precise formulation of the degradation process or still necessitate a r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.14760v1-abstract-full').style.display = 'inline'; document.getElementById('2311.14760v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2311.14760v1-abstract-full" style="display: none;"> While super-resolution (SR) methods based on diffusion models exhibit promising results, their practical application is hindered by the substantial number of required inference steps. Recent methods utilize degraded images in the initial state, thereby shortening the Markov chain. Nevertheless, these solutions either rely on a precise formulation of the degradation process or still necessitate a relatively lengthy generation path (e.g., 15 iterations). To enhance inference speed, we propose a simple yet effective method for achieving single-step SR generation, named SinSR. Specifically, we first derive a deterministic sampling process from the most recent state-of-the-art (SOTA) method for accelerating diffusion-based SR. This allows the mapping between the input random noise and the generated high-resolution image to be obtained in a reduced and acceptable number of inference steps during training. We show that this deterministic mapping can be distilled into a student model that performs SR within only one inference step. Additionally, we propose a novel consistency-preserving loss to simultaneously leverage the ground-truth image during the distillation process, ensuring that the performance of the student model is not solely bound by the feature manifold of the teacher model, resulting in further performance improvement. Extensive experiments conducted on synthetic and real-world datasets demonstrate that the proposed method can achieve comparable or even superior performance compared to both previous SOTA methods and the teacher model, in just one sampling step, resulting in a remarkable up to x10 speedup for inference. Our code will be released at https://github.com/wyf0912/SinSR <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2311.14760v1-abstract-full').style.display = 'none'; document.getElementById('2311.14760v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2310.00234">arXiv:2310.00234</a> <span> [<a href="https://arxiv.org/pdf/2310.00234">pdf</a>, <a href="https://arxiv.org/format/2310.00234">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Pixel-Inconsistency Modeling for Image Manipulation Localization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kong%2C+C">Chenqi Kong</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+A">Anwei Luo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shiqi Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoliang Li</a>, <a href="/search/cs?searchtype=author&query=Rocha%2C+A">Anderson Rocha</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2310.00234v2-abstract-short" style="display: inline;"> Digital image forensics plays a crucial role in image authentication and manipulation localization. Despite the progress powered by deep neural networks, existing forgery localization methodologies exhibit limitations when deployed to unseen datasets and perturbed images (i.e., lack of generalization and robustness to real-world applications). To circumvent these problems and aid image integrity,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.00234v2-abstract-full').style.display = 'inline'; document.getElementById('2310.00234v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2310.00234v2-abstract-full" style="display: none;"> Digital image forensics plays a crucial role in image authentication and manipulation localization. Despite the progress powered by deep neural networks, existing forgery localization methodologies exhibit limitations when deployed to unseen datasets and perturbed images (i.e., lack of generalization and robustness to real-world applications). To circumvent these problems and aid image integrity, this paper presents a generalized and robust manipulation localization model through the analysis of pixel inconsistency artifacts. The rationale is grounded on the observation that most image signal processors (ISP) involve the demosaicing process, which introduces pixel correlations in pristine images. Moreover, manipulating operations, including splicing, copy-move, and inpainting, directly affect such pixel regularity. We, therefore, first split the input image into several blocks and design masked self-attention mechanisms to model the global pixel dependency in input images. Simultaneously, we optimize another local pixel dependency stream to mine local manipulation clues within input forgery images. In addition, we design novel Learning-to-Weight Modules (LWM) to combine features from the two streams, thereby enhancing the final forgery localization performance. To improve the training process, we propose a novel Pixel-Inconsistency Data Augmentation (PIDA) strategy, driving the model to focus on capturing inherent pixel-level artifacts instead of mining semantic forgery traces. This work establishes a comprehensive benchmark integrating 15 representative detection models across 12 datasets. Extensive experiments show that our method successfully extracts inherent pixel-inconsistency forgery fingerprints and achieve state-of-the-art generalization and robustness performances in image manipulation localization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2310.00234v2-abstract-full').style.display = 'none'; document.getElementById('2310.00234v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2309.11092">arXiv:2309.11092</a> <span> [<a href="https://arxiv.org/pdf/2309.11092">pdf</a>, <a href="https://arxiv.org/format/2309.11092">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Generalized Face Forgery Detection via Adaptive Learning for Pre-trained Vision Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+A">Anwei Luo</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+R">Rizhao Cai</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+C">Chenqi Kong</a>, <a href="/search/cs?searchtype=author&query=Ju%2C+Y">Yakun Ju</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+X">Xiangui Kang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jiwu Huang</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2309.11092v2-abstract-short" style="display: inline;"> With the rapid progress of generative models, the current challenge in face forgery detection is how to effectively detect realistic manipulated faces from different unseen domains. Though previous studies show that pre-trained Vision Transformer (ViT) based models can achieve some promising results after fully fine-tuning on the Deepfake dataset, their generalization performances are still unsati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.11092v2-abstract-full').style.display = 'inline'; document.getElementById('2309.11092v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2309.11092v2-abstract-full" style="display: none;"> With the rapid progress of generative models, the current challenge in face forgery detection is how to effectively detect realistic manipulated faces from different unseen domains. Though previous studies show that pre-trained Vision Transformer (ViT) based models can achieve some promising results after fully fine-tuning on the Deepfake dataset, their generalization performances are still unsatisfactory. One possible reason is that fully fine-tuned ViT-based models may disrupt the pre-trained features [1, 2] and overfit to some data-specific patterns [3]. To alleviate this issue, we present a \textbf{F}orgery-aware \textbf{A}daptive \textbf{Vi}sion \textbf{T}ransformer (FA-ViT) under the adaptive learning paradigm, where the parameters in the pre-trained ViT are kept fixed while the designed adaptive modules are optimized to capture forgery features. Specifically, a global adaptive module is designed to model long-range interactions among input tokens, which takes advantage of self-attention mechanism to mine global forgery clues. To further explore essential local forgery clues, a local adaptive module is proposed to expose local inconsistencies by enhancing the local contextual association. In addition, we introduce a fine-grained adaptive learning module that emphasizes the common compact representation of genuine faces through relationship learning in fine-grained pairs, driving these proposed adaptive modules to be aware of fine-grained forgery-aware information. Extensive experiments demonstrate that our FA-ViT achieves state-of-the-arts results in the cross-dataset evaluation, and enhances the robustness against unseen perturbations. Particularly, FA-ViT achieves 93.83\% and 78.32\% AUC scores on Celeb-DF and DFDC datasets in the cross-dataset evaluation. The code and trained model have been released at: https://github.com/LoveSiameseCat/FAViT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2309.11092v2-abstract-full').style.display = 'none'; document.getElementById('2309.11092v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 September, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.07710">arXiv:2307.07710</a> <span> [<a href="https://arxiv.org/pdf/2307.07710">pdf</a>, <a href="https://arxiv.org/format/2307.07710">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> ExposureDiffusion: Learning to Expose for Low-light Image Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yufei Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yi Yu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+L">Lanqing Guo</a>, <a href="/search/cs?searchtype=author&query=Chau%2C+L">Lap-Pui Chau</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+B">Bihan Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.07710v2-abstract-short" style="display: inline;"> Previous raw image-based low-light image enhancement methods predominantly relied on feed-forward neural networks to learn deterministic mappings from low-light to normally-exposed images. However, they failed to capture critical distribution information, leading to visually undesirable results. This work addresses the issue by seamlessly integrating a diffusion model with a physics-based exposure… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.07710v2-abstract-full').style.display = 'inline'; document.getElementById('2307.07710v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.07710v2-abstract-full" style="display: none;"> Previous raw image-based low-light image enhancement methods predominantly relied on feed-forward neural networks to learn deterministic mappings from low-light to normally-exposed images. However, they failed to capture critical distribution information, leading to visually undesirable results. This work addresses the issue by seamlessly integrating a diffusion model with a physics-based exposure model. Different from a vanilla diffusion model that has to perform Gaussian denoising, with the injected physics-based exposure model, our restoration process can directly start from a noisy image instead of pure noise. As such, our method obtains significantly improved performance and reduced inference time compared with vanilla diffusion models. To make full use of the advantages of different intermediate steps, we further propose an adaptive residual layer that effectively screens out the side-effect in the iterative refinement when the intermediate results have been already well-exposed. The proposed framework can work with both real-paired datasets, SOTA noise models, and different backbone networks. Note that, the proposed framework is compatible with real-paired datasets, real/synthetic noise models, and different backbone networks. We evaluate the proposed method on various public benchmarks, achieving promising results with consistent improvements using different exposure models and backbones. Besides, the proposed method achieves better generalization capacity for unseen amplifying ratios and better performance than a larger feedforward neural model when few parameters are adopted. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.07710v2-abstract-full').style.display = 'none'; document.getElementById('2307.07710v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by ICCV2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.07286">arXiv:2307.07286</a> <span> [<a href="https://arxiv.org/pdf/2307.07286">pdf</a>, <a href="https://arxiv.org/format/2307.07286">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> One-Shot Action Recognition via Multi-Scale Spatial-Temporal Skeleton Matching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+S">Siyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jun Liu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+S">Shijian Lu</a>, <a href="/search/cs?searchtype=author&query=Hwa%2C+E+M">Er Meng Hwa</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.07286v2-abstract-short" style="display: inline;"> One-shot skeleton action recognition, which aims to learn a skeleton action recognition model with a single training sample, has attracted increasing interest due to the challenge of collecting and annotating large-scale skeleton action data. However, most existing studies match skeleton sequences by comparing their feature vectors directly which neglects spatial structures and temporal orders of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.07286v2-abstract-full').style.display = 'inline'; document.getElementById('2307.07286v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.07286v2-abstract-full" style="display: none;"> One-shot skeleton action recognition, which aims to learn a skeleton action recognition model with a single training sample, has attracted increasing interest due to the challenge of collecting and annotating large-scale skeleton action data. However, most existing studies match skeleton sequences by comparing their feature vectors directly which neglects spatial structures and temporal orders of skeleton data. This paper presents a novel one-shot skeleton action recognition technique that handles skeleton action recognition via multi-scale spatial-temporal feature matching. We represent skeleton data at multiple spatial and temporal scales and achieve optimal feature matching from two perspectives. The first is multi-scale matching which captures the scale-wise semantic relevance of skeleton data at multiple spatial and temporal scales simultaneously. The second is cross-scale matching which handles different motion magnitudes and speeds by capturing sample-wise relevance across multiple scales. Extensive experiments over three large-scale datasets (NTU RGB+D, NTU RGB+D 120, and PKU-MMD) show that our method achieves superior one-shot skeleton action recognition, and it outperforms the state-of-the-art consistently by large margins. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.07286v2-abstract-full').style.display = 'none'; document.getElementById('2307.07286v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 4 figures, 6 tables. Accepted by IEEE Transactions on Pattern Analysis and Machine Intelligence</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2307.04122">arXiv:2307.04122</a> <span> [<a href="https://arxiv.org/pdf/2307.04122">pdf</a>, <a href="https://arxiv.org/format/2307.04122">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Low-Light Images Using Infrared-Encoded Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tian%2C+S">Shulin Tian</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yufei Wang</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+R">Renjie Wan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+B">Bihan Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2307.04122v1-abstract-short" style="display: inline;"> Low-light image enhancement task is essential yet challenging as it is ill-posed intrinsically. Previous arts mainly focus on the low-light images captured in the visible spectrum using pixel-wise loss, which limits the capacity of recovering the brightness, contrast, and texture details due to the small number of income photons. In this work, we propose a novel approach to increase the visibility… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.04122v1-abstract-full').style.display = 'inline'; document.getElementById('2307.04122v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2307.04122v1-abstract-full" style="display: none;"> Low-light image enhancement task is essential yet challenging as it is ill-posed intrinsically. Previous arts mainly focus on the low-light images captured in the visible spectrum using pixel-wise loss, which limits the capacity of recovering the brightness, contrast, and texture details due to the small number of income photons. In this work, we propose a novel approach to increase the visibility of images captured under low-light environments by removing the in-camera infrared (IR) cut-off filter, which allows for the capture of more photons and results in improved signal-to-noise ratio due to the inclusion of information from the IR spectrum. To verify the proposed strategy, we collect a paired dataset of low-light images captured without the IR cut-off filter, with corresponding long-exposure reference images with an external filter. The experimental results on the proposed dataset demonstrate the effectiveness of the proposed method, showing better performance quantitatively and qualitatively. The dataset and code are publicly available at https://wyf0912.github.io/ELIEI/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2307.04122v1-abstract-full').style.display = 'none'; document.getElementById('2307.04122v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 July, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The first two authors contribute equally. The work is accepted by ICIP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2306.12058">arXiv:2306.12058</a> <span> [<a href="https://arxiv.org/pdf/2306.12058">pdf</a>, <a href="https://arxiv.org/format/2306.12058">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Beyond Learned Metadata-based Raw Image Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yufei Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yi Yu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+L">Lanqing Guo</a>, <a href="/search/cs?searchtype=author&query=Chau%2C+L">Lap-Pui Chau</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+B">Bihan Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2306.12058v1-abstract-short" style="display: inline;"> While raw images have distinct advantages over sRGB images, e.g., linearity and fine-grained quantization levels, they are not widely adopted by general users due to their substantial storage requirements. Very recent studies propose to compress raw images by designing sampling masks within the pixel space of the raw image. However, these approaches often leave space for pursuing more effective im… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.12058v1-abstract-full').style.display = 'inline'; document.getElementById('2306.12058v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2306.12058v1-abstract-full" style="display: none;"> While raw images have distinct advantages over sRGB images, e.g., linearity and fine-grained quantization levels, they are not widely adopted by general users due to their substantial storage requirements. Very recent studies propose to compress raw images by designing sampling masks within the pixel space of the raw image. However, these approaches often leave space for pursuing more effective image representations and compact metadata. In this work, we propose a novel framework that learns a compact representation in the latent space, serving as metadata, in an end-to-end manner. Compared with lossy image compression, we analyze the intrinsic difference of the raw image reconstruction task caused by rich information from the sRGB image. Based on the analysis, a novel backbone design with asymmetric and hybrid spatial feature resolutions is proposed, which significantly improves the rate-distortion performance. Besides, we propose a novel design of the context model, which can better predict the order masks of encoding/decoding based on both the sRGB image and the masks of already processed features. Benefited from the better modeling of the correlation between order masks, the already processed information can be better utilized. Moreover, a novel sRGB-guided adaptive quantization precision strategy, which dynamically assigns varying levels of quantization precision to different regions, further enhances the representation ability of the model. Finally, based on the iterative properties of the proposed context model, we propose a novel strategy to achieve variable bit rates using a single model. This strategy allows for the continuous convergence of a wide range of bit rates. Extensive experimental results demonstrate that the proposed method can achieve better reconstruction quality with a smaller metadata size. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2306.12058v1-abstract-full').style.display = 'none'; document.getElementById('2306.12058v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 June, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.12489">arXiv:2304.12489</a> <span> [<a href="https://arxiv.org/pdf/2304.12489">pdf</a>, <a href="https://arxiv.org/format/2304.12489">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Beyond the Prior Forgery Knowledge: Mining Critical Clues for General Face Forgery Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+A">Anwei Luo</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+C">Chenqi Kong</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jiwu Huang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yongjian Hu</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+X">Xiangui Kang</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.12489v1-abstract-short" style="display: inline;"> Face forgery detection is essential in combating malicious digital face attacks. Previous methods mainly rely on prior expert knowledge to capture specific forgery clues, such as noise patterns, blending boundaries, and frequency artifacts. However, these methods tend to get trapped in local optima, resulting in limited robustness and generalization capability. To address these issues, we propose… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.12489v1-abstract-full').style.display = 'inline'; document.getElementById('2304.12489v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.12489v1-abstract-full" style="display: none;"> Face forgery detection is essential in combating malicious digital face attacks. Previous methods mainly rely on prior expert knowledge to capture specific forgery clues, such as noise patterns, blending boundaries, and frequency artifacts. However, these methods tend to get trapped in local optima, resulting in limited robustness and generalization capability. To address these issues, we propose a novel Critical Forgery Mining (CFM) framework, which can be flexibly assembled with various backbones to boost their generalization and robustness performance. Specifically, we first build a fine-grained triplet and suppress specific forgery traces through prior knowledge-agnostic data augmentation. Subsequently, we propose a fine-grained relation learning prototype to mine critical information in forgeries through instance and local similarity-aware losses. Moreover, we design a novel progressive learning controller to guide the model to focus on principal feature components, enabling it to learn critical forgery features in a coarse-to-fine manner. The proposed method achieves state-of-the-art forgery detection performance under various challenging evaluation settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.12489v1-abstract-full').style.display = 'none'; document.getElementById('2304.12489v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2304.08799">arXiv:2304.08799</a> <span> [<a href="https://arxiv.org/pdf/2304.08799">pdf</a>, <a href="https://arxiv.org/format/2304.08799">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Self-Supervised 3D Action Representation Learning with Skeleton Cloud Colorization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+S">Siyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jun Liu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+S">Shijian Lu</a>, <a href="/search/cs?searchtype=author&query=Hwa%2C+E+M">Er Meng Hwa</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yongjian Hu</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2304.08799v3-abstract-short" style="display: inline;"> 3D Skeleton-based human action recognition has attracted increasing attention in recent years. Most of the existing work focuses on supervised learning which requires a large number of labeled action sequences that are often expensive and time-consuming to annotate. In this paper, we address self-supervised 3D action representation learning for skeleton-based action recognition. We investigate sel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.08799v3-abstract-full').style.display = 'inline'; document.getElementById('2304.08799v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2304.08799v3-abstract-full" style="display: none;"> 3D Skeleton-based human action recognition has attracted increasing attention in recent years. Most of the existing work focuses on supervised learning which requires a large number of labeled action sequences that are often expensive and time-consuming to annotate. In this paper, we address self-supervised 3D action representation learning for skeleton-based action recognition. We investigate self-supervised representation learning and design a novel skeleton cloud colorization technique that is capable of learning spatial and temporal skeleton representations from unlabeled skeleton sequence data. We represent a skeleton action sequence as a 3D skeleton cloud and colorize each point in the cloud according to its temporal and spatial orders in the original (unannotated) skeleton sequence. Leveraging the colorized skeleton point cloud, we design an auto-encoder framework that can learn spatial-temporal features from the artificial color labels of skeleton joints effectively. Specifically, we design a two-steam pretraining network that leverages fine-grained and coarse-grained colorization to learn multi-scale spatial-temporal features. In addition, we design a Masked Skeleton Cloud Repainting task that can pretrain the designed auto-encoder framework to learn informative representations. We evaluate our skeleton cloud colorization approach with linear classifiers trained under different configurations, including unsupervised, semi-supervised, fully-supervised, and transfer learning settings. Extensive experiments on NTU RGB+D, NTU RGB+D 120, PKU-MMD, NW-UCLA, and UWA3D datasets show that the proposed method outperforms existing unsupervised and semi-supervised 3D action recognition methods by large margins and achieves competitive performance in supervised 3D action recognition as well. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2304.08799v3-abstract-full').style.display = 'none'; document.getElementById('2304.08799v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 April, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by TPAMI. This work is an extension of our ICCV 2021 paper [arXiv:2108.01959] https://openaccess.thecvf.com/content/ICCV2021/html/Yang_Skeleton_Cloud_Colorization_for_Unsupervised_3D_Action_Representation_Learning_ICCV_2021_paper.html</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.10452">arXiv:2303.10452</a> <span> [<a href="https://arxiv.org/pdf/2303.10452">pdf</a>, <a href="https://arxiv.org/format/2303.10452">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Confidence Attention and Generalization Enhanced Distillation for Continuous Video Domain Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiyu Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yuecong Xu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jianfei Yang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+B">Bihan Wen</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.10452v2-abstract-short" style="display: inline;"> Continuous Video Domain Adaptation (CVDA) is a scenario where a source model is required to adapt to a series of individually available changing target domains continuously without source data or target supervision. It has wide applications, such as robotic vision and autonomous driving. The main underlying challenge of CVDA is to learn helpful information only from the unsupervised target data wh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.10452v2-abstract-full').style.display = 'inline'; document.getElementById('2303.10452v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.10452v2-abstract-full" style="display: none;"> Continuous Video Domain Adaptation (CVDA) is a scenario where a source model is required to adapt to a series of individually available changing target domains continuously without source data or target supervision. It has wide applications, such as robotic vision and autonomous driving. The main underlying challenge of CVDA is to learn helpful information only from the unsupervised target data while avoiding forgetting previously learned knowledge catastrophically, which is out of the capability of previous Video-based Unsupervised Domain Adaptation methods. Therefore, we propose a Confidence-Attentive network with geneRalization enhanced self-knowledge disTillation (CART) to address the challenge in CVDA. Firstly, to learn from unsupervised domains, we propose to learn from pseudo labels. However, in continuous adaptation, prediction errors can accumulate rapidly in pseudo labels, and CART effectively tackles this problem with two key modules. Specifically, The first module generates refined pseudo labels using model predictions and deploys a novel attentive learning strategy. The second module compares the outputs of augmented data from the current model to the outputs of weakly augmented data from the source model, forming a novel consistency regularization on the model to alleviate the accumulation of prediction errors. Extensive experiments suggest that the CVDA performance of CART outperforms existing methods by a considerable margin. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.10452v2-abstract-full').style.display = 'none'; document.getElementById('2303.10452v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 August, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 9 tables, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2303.02057">arXiv:2303.02057</a> <span> [<a href="https://arxiv.org/pdf/2303.02057">pdf</a>, <a href="https://arxiv.org/format/2303.02057">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Unsupervised Deep Digital Staining For Microscopic Cell Images Via Knowledge Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Ziwang Xu</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+L">Lanqing Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shuyan Zhang</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+B">Bihan Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2303.02057v1-abstract-short" style="display: inline;"> Staining is critical to cell imaging and medical diagnosis, which is expensive, time-consuming, labor-intensive, and causes irreversible changes to cell tissues. Recent advances in deep learning enabled digital staining via supervised model training. However, it is difficult to obtain large-scale stained/unstained cell image pairs in practice, which need to be perfectly aligned with the supervisio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.02057v1-abstract-full').style.display = 'inline'; document.getElementById('2303.02057v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2303.02057v1-abstract-full" style="display: none;"> Staining is critical to cell imaging and medical diagnosis, which is expensive, time-consuming, labor-intensive, and causes irreversible changes to cell tissues. Recent advances in deep learning enabled digital staining via supervised model training. However, it is difficult to obtain large-scale stained/unstained cell image pairs in practice, which need to be perfectly aligned with the supervision. In this work, we propose a novel unsupervised deep learning framework for the digital staining of cell images using knowledge distillation and generative adversarial networks (GANs). A teacher model is first trained mainly for the colorization of bright-field images. After that,a student GAN for staining is obtained by knowledge distillation with hybrid non-reference losses. We show that the proposed unsupervised deep staining method can generate stained images with more accurate positions and shapes of the cell targets. Compared with other unsupervised deep generative models for staining, our method achieves much more promising results both qualitatively and quantitatively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2303.02057v1-abstract-full').style.display = 'none'; document.getElementById('2303.02057v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 March, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.14677">arXiv:2302.14677</a> <span> [<a href="https://arxiv.org/pdf/2302.14677">pdf</a>, <a href="https://arxiv.org/format/2302.14677">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Backdoor Attacks Against Deep Image Compression via Adaptive Frequency Trigger </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yi Yu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yufei Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+S">Shijian Lu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Y">Yap-peng Tan</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.14677v1-abstract-short" style="display: inline;"> Recent deep-learning-based compression methods have achieved superior performance compared with traditional approaches. However, deep learning models have proven to be vulnerable to backdoor attacks, where some specific trigger patterns added to the input can lead to malicious behavior of the models. In this paper, we present a novel backdoor attack with multiple triggers against learned image com… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.14677v1-abstract-full').style.display = 'inline'; document.getElementById('2302.14677v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.14677v1-abstract-full" style="display: none;"> Recent deep-learning-based compression methods have achieved superior performance compared with traditional approaches. However, deep learning models have proven to be vulnerable to backdoor attacks, where some specific trigger patterns added to the input can lead to malicious behavior of the models. In this paper, we present a novel backdoor attack with multiple triggers against learned image compression models. Motivated by the widely used discrete cosine transform (DCT) in existing compression systems and standards, we propose a frequency-based trigger injection model that adds triggers in the DCT domain. In particular, we design several attack objectives for various attacking scenarios, including: 1) attacking compression quality in terms of bit-rate and reconstruction quality; 2) attacking task-driven measures, such as down-stream face recognition and semantic segmentation. Moreover, a novel simple dynamic loss is designed to balance the influence of different loss terms adaptively, which helps achieve more efficient training. Extensive experiments show that with our trained trigger injection models and simple modification of encoder parameters (of the compression model), the proposed attack can successfully inject several backdoors with corresponding triggers in a single image compression model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.14677v1-abstract-full').style.display = 'none'; document.getElementById('2302.14677v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by CVPR 2023</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.4 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.14309">arXiv:2302.14309</a> <span> [<a href="https://arxiv.org/pdf/2302.14309">pdf</a>, <a href="https://arxiv.org/format/2302.14309">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Temporal Coherent Test-Time Optimization for Robust Video Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yi%2C+C">Chenyu Yi</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Siyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yufei Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoliang Li</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Y">Yap-Peng Tan</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.14309v1-abstract-short" style="display: inline;"> Deep neural networks are likely to fail when the test data is corrupted in real-world deployment (e.g., blur, weather, etc.). Test-time optimization is an effective way that adapts models to generalize to corrupted data during testing, which has been shown in the image domain. However, the techniques for improving video classification corruption robustness remain few. In this work, we propose a Te… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.14309v1-abstract-full').style.display = 'inline'; document.getElementById('2302.14309v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.14309v1-abstract-full" style="display: none;"> Deep neural networks are likely to fail when the test data is corrupted in real-world deployment (e.g., blur, weather, etc.). Test-time optimization is an effective way that adapts models to generalize to corrupted data during testing, which has been shown in the image domain. However, the techniques for improving video classification corruption robustness remain few. In this work, we propose a Temporal Coherent Test-time Optimization framework (TeCo) to utilize spatio-temporal information in test-time optimization for robust video classification. To exploit information in video with self-supervised learning, TeCo uses global content from video clips and optimizes models for entropy minimization. TeCo minimizes the entropy of the prediction based on the global content from video clips. Meanwhile, it also feeds local content to regularize the temporal coherence at the feature level. TeCo retains the generalization ability of various video classification models and achieves significant improvements in corruption robustness across Mini Kinetics-C and Mini SSV2-C. Furthermore, TeCo sets a new baseline in video classification corruption robustness via test-time optimization. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.14309v1-abstract-full').style.display = 'none'; document.getElementById('2302.14309v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.11950">arXiv:2302.11950</a> <span> [<a href="https://arxiv.org/pdf/2302.11950">pdf</a>, <a href="https://arxiv.org/format/2302.11950">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.2352/EI.2023.35.7.IMAGE-276">10.2352/EI.2023.35.7.IMAGE-276 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Evaluating the Efficacy of Skincare Product: A Realistic Short-Term Facial Pore Simulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+L">Ling Li</a>, <a href="/search/cs?searchtype=author&query=Dissanayake%2C+B">Bandara Dissanayake</a>, <a href="/search/cs?searchtype=author&query=Omotezako%2C+T">Tatsuya Omotezako</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+Y">Yunjie Zhong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qing Zhang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+R">Rizhao Cai</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Q">Qian Zheng</a>, <a href="/search/cs?searchtype=author&query=Sng%2C+D">Dennis Sng</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Weisi Lin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yufei Wang</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.11950v1-abstract-short" style="display: inline;"> Simulating the effects of skincare products on face is a potential new way to communicate the efficacy of skincare products in skin diagnostics and product recommendations. Furthermore, such simulations enable one to anticipate his/her skin conditions and better manage skin health. However, there is a lack of effective simulations today. In this paper, we propose the first simulation model to reve… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.11950v1-abstract-full').style.display = 'inline'; document.getElementById('2302.11950v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.11950v1-abstract-full" style="display: none;"> Simulating the effects of skincare products on face is a potential new way to communicate the efficacy of skincare products in skin diagnostics and product recommendations. Furthermore, such simulations enable one to anticipate his/her skin conditions and better manage skin health. However, there is a lack of effective simulations today. In this paper, we propose the first simulation model to reveal facial pore changes after using skincare products. Our simulation pipeline consists of 2 steps: training data establishment and facial pore simulation. To establish training data, we collect face images with various pore quality indexes from short-term (8-weeks) clinical studies. People often experience significant skin fluctuations (due to natural rhythms, external stressors, etc.,), which introduces large perturbations in clinical data. To address this problem, we propose a sliding window mechanism to clean data and select representative index(es) to represent facial pore changes. Facial pore simulation stage consists of 3 modules: UNet-based segmentation module to localize facial pores; regression module to predict time-dependent warping hyperparameters; and deformation module, taking warping hyperparameters and pore segmentation labels as inputs, to precisely deform pores accordingly. The proposed simulation is able to render realistic facial pore changes. And this work will pave the way for future research in facial skin simulation and skincare product developments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.11950v1-abstract-full').style.display = 'none'; document.getElementById('2302.11950v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.05936">arXiv:2302.05936</a> <span> [<a href="https://arxiv.org/pdf/2302.05936">pdf</a>, <a href="https://arxiv.org/format/2302.05936">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Generalized Few-Shot Continual Learning with Contrastive Mixture of Adapters </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cui%2C+Y">Yawen Cui</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zitong Yu</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+R">Rizhao Cai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xun Wang</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Li Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.05936v1-abstract-short" style="display: inline;"> The goal of Few-Shot Continual Learning (FSCL) is to incrementally learn novel tasks with limited labeled samples and preserve previous capabilities simultaneously, while current FSCL methods are all for the class-incremental purpose. Moreover, the evaluation of FSCL solutions is only the cumulative performance of all encountered tasks, but there is no work on exploring the domain generalization a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.05936v1-abstract-full').style.display = 'inline'; document.getElementById('2302.05936v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.05936v1-abstract-full" style="display: none;"> The goal of Few-Shot Continual Learning (FSCL) is to incrementally learn novel tasks with limited labeled samples and preserve previous capabilities simultaneously, while current FSCL methods are all for the class-incremental purpose. Moreover, the evaluation of FSCL solutions is only the cumulative performance of all encountered tasks, but there is no work on exploring the domain generalization ability. Domain generalization is a challenging yet practical task that aims to generalize beyond training domains. In this paper, we set up a Generalized FSCL (GFSCL) protocol involving both class- and domain-incremental situations together with the domain generalization assessment. Firstly, two benchmark datasets and protocols are newly arranged, and detailed baselines are provided for this unexplored configuration. We find that common continual learning methods have poor generalization ability on unseen domains and cannot better cope with the catastrophic forgetting issue in cross-incremental tasks. In this way, we further propose a rehearsal-free framework based on Vision Transformer (ViT) named Contrastive Mixture of Adapters (CMoA). Due to different optimization targets of class increment and domain increment, the CMoA contains two parts: (1) For the class-incremental issue, the Mixture of Adapters (MoA) module is incorporated into ViT, then cosine similarity regularization and the dynamic weighting are designed to make each adapter learn specific knowledge and concentrate on particular classes. (2) For the domain-related issues and domain-invariant representation learning, we alleviate the inner-class variation by prototype-calibrated contrastive learning. The codes and protocols are available at https://github.com/yawencui/CMoA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.05936v1-abstract-full').style.display = 'none'; document.getElementById('2302.05936v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to International Journal of Computer Vision (IJCV)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2302.05746">arXiv:2302.05746</a> <span> [<a href="https://arxiv.org/pdf/2302.05746">pdf</a>, <a href="https://arxiv.org/format/2302.05746">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> Removing Image Artifacts From Scratched Lens Protectors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yufei Wang</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+R">Renjie Wan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+B">Bihan Wen</a>, <a href="/search/cs?searchtype=author&query=Chau%2C+L">Lap-Pui Chau</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2302.05746v2-abstract-short" style="display: inline;"> A protector is placed in front of the camera lens for mobile devices to avoid damage, while the protector itself can be easily scratched accidentally, especially for plastic ones. The artifacts appear in a wide variety of patterns, making it difficult to see through them clearly. Removing image artifacts from the scratched lens protector is inherently challenging due to the occasional flare artifa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.05746v2-abstract-full').style.display = 'inline'; document.getElementById('2302.05746v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2302.05746v2-abstract-full" style="display: none;"> A protector is placed in front of the camera lens for mobile devices to avoid damage, while the protector itself can be easily scratched accidentally, especially for plastic ones. The artifacts appear in a wide variety of patterns, making it difficult to see through them clearly. Removing image artifacts from the scratched lens protector is inherently challenging due to the occasional flare artifacts and the co-occurring interference within mixed artifacts. Though different methods have been proposed for some specific distortions, they seldom consider such inherent challenges. In our work, we consider the inherent challenges in a unified framework with two cooperative modules, which facilitate the performance boost of each other. We also collect a new dataset from the real world to facilitate training and evaluation purposes. The experimental results demonstrate that our method outperforms the baselines qualitatively and quantitatively. The code and datasets will be released after acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2302.05746v2-abstract-full').style.display = 'none'; document.getElementById('2302.05746v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 February, 2023; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2023. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ISCAS 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2209.01935">arXiv:2209.01935</a> <span> [<a href="https://arxiv.org/pdf/2209.01935">pdf</a>, <a href="https://arxiv.org/format/2209.01935">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Forensicability Assessment of Questioned Images in Recapturing Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+C">Changsheng Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Lin Zhao</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+R">Rizhao Cai</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zitong Yu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jiwu Huang</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2209.01935v1-abstract-short" style="display: inline;"> Recapture detection of face and document images is an important forensic task. With deep learning, the performances of face anti-spoofing (FAS) and recaptured document detection have been improved significantly. However, the performances are not yet satisfactory on samples with weak forensic cues. The amount of forensic cues can be quantified to allow a reliable forensic result. In this work, we p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.01935v1-abstract-full').style.display = 'inline'; document.getElementById('2209.01935v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2209.01935v1-abstract-full" style="display: none;"> Recapture detection of face and document images is an important forensic task. With deep learning, the performances of face anti-spoofing (FAS) and recaptured document detection have been improved significantly. However, the performances are not yet satisfactory on samples with weak forensic cues. The amount of forensic cues can be quantified to allow a reliable forensic result. In this work, we propose a forensicability assessment network to quantify the forensicability of the questioned samples. The low-forensicability samples are rejected before the actual recapturing detection process to improve the efficiency of recapturing detection systems. We first extract forensicability features related to both image quality assessment and forensic tasks. By exploiting domain knowledge of the forensic application in image quality and forensic features, we define three task-specific forensicability classes and the initialized locations in the feature space. Based on the extracted features and the defined centers, we train the proposed forensic assessment network (FANet) with cross-entropy loss and update the centers with a momentum-based update method. We integrate the trained FANet with practical recapturing detection schemes in face anti-spoofing and recaptured document detection tasks. Experimental results show that, for a generic CNN-based FAS scheme, FANet reduces the EERs from 33.75% to 19.23% under ROSE to IDIAP protocol by rejecting samples with the lowest 30% forensicability scores. The performance of FAS schemes is poor in the rejected samples, with EER as high as 56.48%. Similar performances in rejecting low-forensicability samples have been observed for the state-of-the-art approaches in FAS and recaptured document detection tasks. To the best of our knowledge, this is the first work that assesses the forensicability of recaptured document images and improves the system efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2209.01935v1-abstract-full').style.display = 'none'; document.getElementById('2209.01935v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 September, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 10 figures, 2 tables (Submitted to TIFS July-2022)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2208.05401">arXiv:2208.05401</a> <span> [<a href="https://arxiv.org/pdf/2208.05401">pdf</a>, <a href="https://arxiv.org/format/2208.05401">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking Joint Face Spoofing and Forgery Detection with Visual and Physiological Cues </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zitong Yu</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+R">Rizhao Cai</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhi Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jingang Shi</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2208.05401v2-abstract-short" style="display: inline;"> Face anti-spoofing (FAS) and face forgery detection play vital roles in securing face biometric systems from presentation attacks (PAs) and vicious digital manipulation (e.g., deepfakes). Despite promising performance upon large-scale data and powerful deep models, the generalization problem of existing approaches is still an open issue. Most of recent approaches focus on 1) unimodal visual appear… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.05401v2-abstract-full').style.display = 'inline'; document.getElementById('2208.05401v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2208.05401v2-abstract-full" style="display: none;"> Face anti-spoofing (FAS) and face forgery detection play vital roles in securing face biometric systems from presentation attacks (PAs) and vicious digital manipulation (e.g., deepfakes). Despite promising performance upon large-scale data and powerful deep models, the generalization problem of existing approaches is still an open issue. Most of recent approaches focus on 1) unimodal visual appearance or physiological (i.e., remote photoplethysmography (rPPG)) cues; and 2) separated feature representation for FAS or face forgery detection. On one side, unimodal appearance and rPPG features are respectively vulnerable to high-fidelity face 3D mask and video replay attacks, inspiring us to design reliable multi-modal fusion mechanisms for generalized face attack detection. On the other side, there are rich common features across FAS and face forgery detection tasks (e.g., periodic rPPG rhythms and vanilla appearance for bonafides), providing solid evidence to design a joint FAS and face forgery detection system in a multi-task learning fashion. In this paper, we establish the first joint face spoofing and forgery detection benchmark using both visual appearance and physiological rPPG cues. To enhance the rPPG periodicity discrimination, we design a two-branch physiological network using both facial spatio-temporal rPPG signal map and its continuous wavelet transformed counterpart as inputs. To mitigate the modality bias and improve the fusion efficacy, we conduct a weighted batch and layer normalization for both appearance and rPPG features before multi-modal fusion. We find that the generalization capacities of both unimodal (appearance or rPPG) and multi-modal (appearance+rPPG) models can be obviously improved via joint training on these two tasks. We hope this new benchmark will facilitate the future research of both FAS and deepfake detection communities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2208.05401v2-abstract-full').style.display = 'none'; document.getElementById('2208.05401v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 August, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE Transactions on Dependable and Secure Computing (TDSC). Corresponding authors: Zitong Yu and Wenhan Yang</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2207.01204">arXiv:2207.01204</a> <span> [<a href="https://arxiv.org/pdf/2207.01204">pdf</a>, <a href="https://arxiv.org/format/2207.01204">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Adversarial Pairwise Reverse Attention for Camera Performance Imbalance in Person Re-identification: New Dataset and Metrics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ang%2C+E+P+W">Eugene P. W. Ang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+S">Shan Lin</a>, <a href="/search/cs?searchtype=author&query=Ahuja%2C+R">Rahul Ahuja</a>, <a href="/search/cs?searchtype=author&query=Ahmed%2C+N">Nemath Ahmed</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2207.01204v1-abstract-short" style="display: inline;"> Existing evaluation metrics for Person Re-Identification (Person ReID) models focus on system-wide performance. However, our studies reveal weaknesses due to the uneven data distributions among cameras and different camera properties that expose the ReID system to exploitation. In this work, we raise the long-ignored ReID problem of camera performance imbalance and collect a real-world privacy-awa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.01204v1-abstract-full').style.display = 'inline'; document.getElementById('2207.01204v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2207.01204v1-abstract-full" style="display: none;"> Existing evaluation metrics for Person Re-Identification (Person ReID) models focus on system-wide performance. However, our studies reveal weaknesses due to the uneven data distributions among cameras and different camera properties that expose the ReID system to exploitation. In this work, we raise the long-ignored ReID problem of camera performance imbalance and collect a real-world privacy-aware dataset from 38 cameras to assist the study of the imbalance issue. We propose new metrics to quantify camera performance imbalance and further propose the Adversarial Pairwise Reverse Attention (APRA) Module to guide the model learning the camera invariant feature with a novel pairwise attention inversion mechanism. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2207.01204v1-abstract-full').style.display = 'none'; document.getElementById('2207.01204v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 July, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted into the IEEE International Conference on Image Processing (ICIP) 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2205.03792">arXiv:2205.03792</a> <span> [<a href="https://arxiv.org/pdf/2205.03792">pdf</a>, <a href="https://arxiv.org/format/2205.03792">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> One-Class Knowledge Distillation for Face Presentation Attack Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhi Li</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+R">Rizhao Cai</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoliang Li</a>, <a href="/search/cs?searchtype=author&query=Lam%2C+K">Kwok-Yan Lam</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yongjian Hu</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2205.03792v1-abstract-short" style="display: inline;"> Face presentation attack detection (PAD) has been extensively studied by research communities to enhance the security of face recognition systems. Although existing methods have achieved good performance on testing data with similar distribution as the training data, their performance degrades severely in application scenarios with data of unseen distributions. In situations where the training and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.03792v1-abstract-full').style.display = 'inline'; document.getElementById('2205.03792v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2205.03792v1-abstract-full" style="display: none;"> Face presentation attack detection (PAD) has been extensively studied by research communities to enhance the security of face recognition systems. Although existing methods have achieved good performance on testing data with similar distribution as the training data, their performance degrades severely in application scenarios with data of unseen distributions. In situations where the training and testing data are drawn from different domains, a typical approach is to apply domain adaptation techniques to improve face PAD performance with the help of target domain data. However, it has always been a non-trivial challenge to collect sufficient data samples in the target domain, especially for attack samples. This paper introduces a teacher-student framework to improve the cross-domain performance of face PAD with one-class domain adaptation. In addition to the source domain data, the framework utilizes only a few genuine face samples of the target domain. Under this framework, a teacher network is trained with source domain samples to provide discriminative feature representations for face PAD. Student networks are trained to mimic the teacher network and learn similar representations for genuine face samples of the target domain. In the test phase, the similarity score between the representations of the teacher and student networks is used to distinguish attacks from genuine ones. To evaluate the proposed framework under one-class domain adaptation settings, we devised two new protocols and conducted extensive experiments. The experimental results show that our method outperforms baselines under one-class domain adaptation settings and even state-of-the-art methods with unsupervised domain adaptation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2205.03792v1-abstract-full').style.display = 'none'; document.getElementById('2205.03792v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2022. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.16931">arXiv:2203.16931</a> <span> [<a href="https://arxiv.org/pdf/2203.16931">pdf</a>, <a href="https://arxiv.org/format/2203.16931">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Towards Robust Rain Removal Against Adversarial Attacks: A Comprehensive Benchmark Analysis and Beyond </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yi Yu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wenhan Yang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Y">Yap-Peng Tan</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.16931v1-abstract-short" style="display: inline;"> Rain removal aims to remove rain streaks from images/videos and reduce the disruptive effects caused by rain. It not only enhances image/video visibility but also allows many computer vision algorithms to function properly. This paper makes the first attempt to conduct a comprehensive study on the robustness of deep learning-based rain removal methods against adversarial attacks. Our study shows t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.16931v1-abstract-full').style.display = 'inline'; document.getElementById('2203.16931v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.16931v1-abstract-full" style="display: none;"> Rain removal aims to remove rain streaks from images/videos and reduce the disruptive effects caused by rain. It not only enhances image/video visibility but also allows many computer vision algorithms to function properly. This paper makes the first attempt to conduct a comprehensive study on the robustness of deep learning-based rain removal methods against adversarial attacks. Our study shows that, when the image/video is highly degraded, rain removal methods are more vulnerable to the adversarial attacks as small distortions/perturbations become less noticeable or detectable. In this paper, we first present a comprehensive empirical evaluation of various methods at different levels of attacks and with various losses/targets to generate the perturbations from the perspective of human perception and machine analysis tasks. A systematic evaluation of key modules in existing methods is performed in terms of their robustness against adversarial attacks. From the insights of our analysis, we construct a more robust deraining method by integrating these effective modules. Finally, we examine various types of adversarial attacks that are specific to deraining problems and their effects on both human and machine vision tasks, including 1) rain region attacks, adding perturbations only in the rain regions to make the perturbations in the attacked rain images less visible; 2) object-sensitive attacks, adding perturbations only in regions near the given objects. Code is available at https://github.com/yuyi-sd/Robust_Rain_Removal. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.16931v1-abstract-full').style.display = 'none'; document.getElementById('2203.16931v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures, to appear in CVPR 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2203.16056">arXiv:2203.16056</a> <span> [<a href="https://arxiv.org/pdf/2203.16056">pdf</a>, <a href="https://arxiv.org/format/2203.16056">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Automatic Facial Skin Feature Detection for Everyone </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+Q">Qian Zheng</a>, <a href="/search/cs?searchtype=author&query=Purwar%2C+A">Ankur Purwar</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Heng Zhao</a>, <a href="/search/cs?searchtype=author&query=Lim%2C+G+L">Guang Liang Lim</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Ling Li</a>, <a href="/search/cs?searchtype=author&query=Behera%2C+D">Debasish Behera</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qian Wang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+M">Min Tan</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+R">Rizhao Cai</a>, <a href="/search/cs?searchtype=author&query=Werner%2C+J">Jennifer Werner</a>, <a href="/search/cs?searchtype=author&query=Sng%2C+D">Dennis Sng</a>, <a href="/search/cs?searchtype=author&query=van+Steensel%2C+M">Maurice van Steensel</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+W">Weisi Lin</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2203.16056v1-abstract-short" style="display: inline;"> Automatic assessment and understanding of facial skin condition have several applications, including the early detection of underlying health problems, lifestyle and dietary treatment, skin-care product recommendation, etc. Selfies in the wild serve as an excellent data resource to democratize skin quality assessment, but suffer from several data collection challenges.The key to guaranteeing an ac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.16056v1-abstract-full').style.display = 'inline'; document.getElementById('2203.16056v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2203.16056v1-abstract-full" style="display: none;"> Automatic assessment and understanding of facial skin condition have several applications, including the early detection of underlying health problems, lifestyle and dietary treatment, skin-care product recommendation, etc. Selfies in the wild serve as an excellent data resource to democratize skin quality assessment, but suffer from several data collection challenges.The key to guaranteeing an accurate assessment is accurate detection of different skin features. We present an automatic facial skin feature detection method that works across a variety of skin tones and age groups for selfies in the wild. To be specific, we annotate the locations of acne, pigmentation, and wrinkle for selfie images with different skin tone colors, severity levels, and lighting conditions. The annotation is conducted in a two-phase scheme with the help of a dermatologist to train volunteers for annotation. We employ Unet++ as the network architecture for feature detection. This work shows that the two-phase annotation scheme can robustly detect the accurate locations of acne, pigmentation, and wrinkle for selfie images with different ethnicities, skin tone colors, severity levels, age groups, and lighting conditions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2203.16056v1-abstract-full').style.display = 'none'; document.getElementById('2203.16056v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 March, 2022; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2022. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the conference of Electronic Imaging (EI) 2022</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.11391">arXiv:2110.11391</a> <span> [<a href="https://arxiv.org/pdf/2110.11391">pdf</a>, <a href="https://arxiv.org/format/2110.11391">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DEX: Domain Embedding Expansion for Generalized Person Re-identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ang%2C+E+P+W">Eugene P. W. Ang</a>, <a href="/search/cs?searchtype=author&query=Shan%2C+L">Lin Shan</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.11391v1-abstract-short" style="display: inline;"> In recent years, supervised Person Re-identification (Person ReID) approaches have demonstrated excellent performance. However, when these methods are applied to inputs from a different camera network, they typically suffer from significant performance degradation. Different from most domain adaptation (DA) approaches addressing this issue, we focus on developing a domain generalization (DG) Perso… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.11391v1-abstract-full').style.display = 'inline'; document.getElementById('2110.11391v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.11391v1-abstract-full" style="display: none;"> In recent years, supervised Person Re-identification (Person ReID) approaches have demonstrated excellent performance. However, when these methods are applied to inputs from a different camera network, they typically suffer from significant performance degradation. Different from most domain adaptation (DA) approaches addressing this issue, we focus on developing a domain generalization (DG) Person ReID model that can be deployed without additional fine-tuning or adaptation. In this paper, we propose the Domain Embedding Expansion (DEX) module. DEX dynamically manipulates and augments deep features based on person and domain labels during training, significantly improving the generalization capability and robustness of Person ReID models to unseen domains. We also developed a light version of DEX (DEXLite), applying negative sampling techniques to scale to larger datasets and reduce memory usage for multi-branch networks. Our proposed DEX and DEXLite can be combined with many existing methods, Bag-of-Tricks (BagTricks), the Multi-Granularity Network (MGN), and Part-Based Convolutional Baseline (PCB), in a plug-and-play manner. With DEX and DEXLite, existing methods can gain significant improvements when tested on other unseen datasets, thereby demonstrating the general applicability of our method. Our solution outperforms the state-of-the-art DG Person ReID methods in all large-scale benchmarks as well as in most the small-scale benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.11391v1-abstract-full').style.display = 'none'; document.getElementById('2110.11391v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted into BMVC 2021</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.09108">arXiv:2110.09108</a> <span> [<a href="https://arxiv.org/pdf/2110.09108">pdf</a>, <a href="https://arxiv.org/format/2110.09108">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Asymmetric Modality Translation For Face Presentation Attack Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhi Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoliang Li</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xin Luo</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yongjian Hu</a>, <a href="/search/cs?searchtype=author&query=Lam%2C+K">Kwok-Yan Lam</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex C. Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.09108v2-abstract-short" style="display: inline;"> Face presentation attack detection (PAD) is an essential measure to protect face recognition systems from being spoofed by malicious users and has attracted great attention from both academia and industry. Although most of the existing methods can achieve desired performance to some extent, the generalization issue of face presentation attack detection under cross-domain settings (e.g., the settin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.09108v2-abstract-full').style.display = 'inline'; document.getElementById('2110.09108v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.09108v2-abstract-full" style="display: none;"> Face presentation attack detection (PAD) is an essential measure to protect face recognition systems from being spoofed by malicious users and has attracted great attention from both academia and industry. Although most of the existing methods can achieve desired performance to some extent, the generalization issue of face presentation attack detection under cross-domain settings (e.g., the setting of unseen attacks and varying illumination) remains to be solved. In this paper, we propose a novel framework based on asymmetric modality translation for face presentation attack detection in bi-modality scenarios. Under the framework, we establish connections between two modality images of genuine faces. Specifically, a novel modality fusion scheme is presented that the image of one modality is translated to the other one through an asymmetric modality translator, then fused with its corresponding paired image. The fusion result is fed as the input to a discriminator for inference. The training of the translator is supervised by an asymmetric modality translation loss. Besides, an illumination normalization module based on Pattern of Local Gravitational Force (PLGF) representation is used to reduce the impact of illumination variation. We conduct extensive experiments on three public datasets, which validate that our method is effective in detecting various types of attacks and achieves state-of-the-art performance under different evaluation protocols. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.09108v2-abstract-full').style.display = 'none'; document.getElementById('2110.09108v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2110.06753">arXiv:2110.06753</a> <span> [<a href="https://arxiv.org/pdf/2110.06753">pdf</a>, <a href="https://arxiv.org/format/2110.06753">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TIFS.2022.3158551">10.1109/TIFS.2022.3158551 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Learning Meta Pattern for Face Anti-Spoofing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cai%2C+R">Rizhao Cai</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhi Li</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+R">Renjie Wan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haoliang Li</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yongjian Hu</a>, <a href="/search/cs?searchtype=author&query=Kot%2C+A+C">Alex Chichung Kot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2110.06753v2-abstract-short" style="display: inline;"> Face Anti-Spoofing (FAS) is essential to secure face recognition systems and has been extensively studied in recent years. Although deep neural networks (DNNs) for the FAS task have achieved promising results in intra-dataset experiments with similar distributions of training and testing data, the DNNs' generalization ability is limited under the cross-domain scenarios with different distributions… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.06753v2-abstract-full').style.display = 'inline'; document.getElementById('2110.06753v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2110.06753v2-abstract-full" style="display: none;"> Face Anti-Spoofing (FAS) is essential to secure face recognition systems and has been extensively studied in recent years. Although deep neural networks (DNNs) for the FAS task have achieved promising results in intra-dataset experiments with similar distributions of training and testing data, the DNNs' generalization ability is limited under the cross-domain scenarios with different distributions of training and testing data. To improve the generalization ability, recent hybrid methods have been explored to extract task-aware handcrafted features (e.g., Local Binary Pattern) as discriminative information for the input of DNNs. However, the handcrafted feature extraction relies on experts' domain knowledge, and how to choose appropriate handcrafted features is underexplored. To this end, we propose a learnable network to extract Meta Pattern (MP) in our learning-to-learn framework. By replacing handcrafted features with the MP, the discriminative information from MP is capable of learning a more generalized model. Moreover, we devise a two-stream network to hierarchically fuse the input RGB image and the extracted MP by using our proposed Hierarchical Fusion Module (HFM). We conduct comprehensive experiments and show that our MP outperforms the compared handcrafted features. Also, our proposed method with HFM and the MP can achieve state-of-the-art performance on two different domain generalization evaluation benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2110.06753v2-abstract-full').style.display = 'none'; document.getElementById('2110.06753v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 May, 2022; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 13 October, 2021; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2021. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE Transactions on Information Forensics and Security (https://ieeexplore.ieee.org.remotexs.ntu.edu.sg/document/9732458) Source code available in https://github.com/RizhaoCai/MetaPattern_FAS</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Transactions on Information Forensics and Security, vol. 17, pp. 1201-1213, 2022 </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Koot%2C+A+C&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Koot%2C+A+C&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Koot%2C+A+C&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>